Diffstat (limited to 'src/dsp/x86')
-rw-r--r--  src/dsp/x86/average_blend_sse4.cc  156
-rw-r--r--  src/dsp/x86/average_blend_sse4.h  41
-rw-r--r--  src/dsp/x86/cdef_sse4.cc  728
-rw-r--r--  src/dsp/x86/cdef_sse4.h  45
-rw-r--r--  src/dsp/x86/common_avx2.h  138
-rw-r--r--  src/dsp/x86/common_sse4.h  265
-rw-r--r--  src/dsp/x86/convolve_avx2.cc  534
-rw-r--r--  src/dsp/x86/convolve_avx2.h  43
-rw-r--r--  src/dsp/x86/convolve_sse4.cc  2830
-rw-r--r--  src/dsp/x86/convolve_sse4.h  75
-rw-r--r--  src/dsp/x86/distance_weighted_blend_sse4.cc  230
-rw-r--r--  src/dsp/x86/distance_weighted_blend_sse4.h  41
-rw-r--r--  src/dsp/x86/intra_edge_sse4.cc  270
-rw-r--r--  src/dsp/x86/intra_edge_sse4.h  46
-rw-r--r--  src/dsp/x86/intrapred_cfl_sse4.cc  976
-rw-r--r--  src/dsp/x86/intrapred_smooth_sse4.cc  2662
-rw-r--r--  src/dsp/x86/intrapred_sse4.cc  3535
-rw-r--r--  src/dsp/x86/intrapred_sse4.h  1060
-rw-r--r--  src/dsp/x86/inverse_transform_sse4.cc  3086
-rw-r--r--  src/dsp/x86/inverse_transform_sse4.h  89
-rw-r--r--  src/dsp/x86/loop_filter_sse4.cc  2256
-rw-r--r--  src/dsp/x86/loop_filter_sse4.h  119
-rw-r--r--  src/dsp/x86/loop_restoration_10bit_avx2.cc  592
-rw-r--r--  src/dsp/x86/loop_restoration_10bit_sse4.cc  551
-rw-r--r--  src/dsp/x86/loop_restoration_avx2.cc  2902
-rw-r--r--  src/dsp/x86/loop_restoration_avx2.h  52
-rw-r--r--  src/dsp/x86/loop_restoration_sse4.cc  2549
-rw-r--r--  src/dsp/x86/loop_restoration_sse4.h  52
-rw-r--r--  src/dsp/x86/mask_blend_sse4.cc  447
-rw-r--r--  src/dsp/x86/mask_blend_sse4.h  60
-rw-r--r--  src/dsp/x86/motion_field_projection_sse4.cc  397
-rw-r--r--  src/dsp/x86/motion_field_projection_sse4.h  41
-rw-r--r--  src/dsp/x86/motion_vector_search_sse4.cc  262
-rw-r--r--  src/dsp/x86/motion_vector_search_sse4.h  41
-rw-r--r--  src/dsp/x86/obmc_sse4.cc  329
-rw-r--r--  src/dsp/x86/obmc_sse4.h  43
-rw-r--r--  src/dsp/x86/super_res_sse4.cc  166
-rw-r--r--  src/dsp/x86/super_res_sse4.h  38
-rw-r--r--  src/dsp/x86/transpose_sse4.h  307
-rw-r--r--  src/dsp/x86/warp_sse4.cc  525
-rw-r--r--  src/dsp/x86/warp_sse4.h  44
-rw-r--r--  src/dsp/x86/weight_mask_sse4.cc  464
-rw-r--r--  src/dsp/x86/weight_mask_sse4.h  104
43 files changed, 29191 insertions, 0 deletions
diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc
new file mode 100644
index 0000000..8e008d1
--- /dev/null
+++ b/src/dsp/x86/average_blend_sse4.cc
@@ -0,0 +1,156 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/average_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kInterPostRoundBit = 4;
+
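+// Blending sums the two int16_t predictions and rounds the result down by
+// kInterPostRoundBit + 1 bits: kInterPostRoundBit bits of intermediate
+// precision plus one bit for the averaging.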
+inline void AverageBlend4Row(const int16_t* prediction_0,
+ const int16_t* prediction_1, uint8_t* dest) {
+ const __m128i pred_0 = LoadLo8(prediction_0);
+ const __m128i pred_1 = LoadLo8(prediction_1);
+ __m128i res = _mm_add_epi16(pred_0, pred_1);
+ res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
+ Store4(dest, _mm_packus_epi16(res, res));
+}
+
+inline void AverageBlend8Row(const int16_t* prediction_0,
+ const int16_t* prediction_1, uint8_t* dest) {
+ const __m128i pred_0 = LoadAligned16(prediction_0);
+ const __m128i pred_1 = LoadAligned16(prediction_1);
+ __m128i res = _mm_add_epi16(pred_0, pred_1);
+ res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
+ StoreLo8(dest, _mm_packus_epi16(res, res));
+}
+
+inline void AverageBlendLargeRow(const int16_t* prediction_0,
+ const int16_t* prediction_1, const int width,
+ uint8_t* dest) {
+ int x = 0;
+ do {
+ const __m128i pred_00 = LoadAligned16(&prediction_0[x]);
+ const __m128i pred_01 = LoadAligned16(&prediction_1[x]);
+ __m128i res0 = _mm_add_epi16(pred_00, pred_01);
+ res0 = RightShiftWithRounding_S16(res0, kInterPostRoundBit + 1);
+ const __m128i pred_10 = LoadAligned16(&prediction_0[x + 8]);
+ const __m128i pred_11 = LoadAligned16(&prediction_1[x + 8]);
+ __m128i res1 = _mm_add_epi16(pred_10, pred_11);
+ res1 = RightShiftWithRounding_S16(res1, kInterPostRoundBit + 1);
+ StoreUnaligned16(dest + x, _mm_packus_epi16(res0, res1));
+ x += 16;
+ } while (x < width);
+}
+
+void AverageBlend_SSE4_1(const void* prediction_0, const void* prediction_1,
+ const int width, const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = height;
+
+ if (width == 4) {
+ do {
+ // TODO(b/150326556): |prediction_[01]| values are packed. It is possible
+ // to load 8 values at a time.
+ AverageBlend4Row(pred_0, pred_1, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ AverageBlend4Row(pred_0, pred_1, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ if (width == 8) {
+ do {
+ AverageBlend8Row(pred_0, pred_1, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ AverageBlend8Row(pred_0, pred_1, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ do {
+ AverageBlendLargeRow(pred_0, pred_1, width, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ AverageBlendLargeRow(pred_0, pred_1, width, dst);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ y -= 2;
+ } while (y != 0);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(AverageBlend)
+ dsp->average_blend = AverageBlend_SSE4_1;
+#endif
+}
+
+} // namespace
+
+void AverageBlendInit_SSE4_1() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void AverageBlendInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/average_blend_sse4.h b/src/dsp/x86/average_blend_sse4.h
new file mode 100644
index 0000000..937e8e2
--- /dev/null
+++ b/src/dsp/x86/average_blend_sse4.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::average_blend. This function is not thread-safe.
+void AverageBlendInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_AverageBlend
+#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_
diff --git a/src/dsp/x86/cdef_sse4.cc b/src/dsp/x86/cdef_sse4.cc
new file mode 100644
index 0000000..3211a2d
--- /dev/null
+++ b/src/dsp/x86/cdef_sse4.cc
@@ -0,0 +1,728 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// Used when calculating odd |cost[x]| values.
+// Holds elements 1 3 5 7 7 7 7 7
+alignas(16) constexpr uint32_t kCdefDivisionTableOddPadded[] = {
+ 420, 210, 140, 105, 105, 105, 105, 105};
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+// for (int j = 0; j < 8; ++j) {
+// const int x = 1;
+// partial[0][i + j] += x;
+// partial[1][i + j / 2] += x;
+// partial[2][i] += x;
+// partial[3][3 + i - j / 2] += x;
+// partial[4][7 + i - j] += x;
+// partial[5][3 - i / 2 + j] += x;
+// partial[6][j] += x;
+// partial[7][i / 2 + j] += x;
+// }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get the
+// correct partial value for the given position.
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(__m128i* v_src_16,
+ __m128i* partial_lo,
+ __m128i* partial_hi) {
+ // 00 01 02 03 04 05 06 07
+ *partial_lo = v_src_16[0];
+ // 00 00 00 00 00 00 00 00
+ *partial_hi = _mm_setzero_si128();
+
+ // 00 10 11 12 13 14 15 16
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[1], 2));
+ // 17 00 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[1], 14));
+
+ // 00 00 20 21 22 23 24 25
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[2], 4));
+ // 26 27 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[2], 12));
+
+ // 00 00 00 30 31 32 33 34
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[3], 6));
+ // 35 36 37 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[3], 10));
+
+ // 00 00 00 00 40 41 42 43
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[4], 8));
+ // 44 45 46 47 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[4], 8));
+
+ // 00 00 00 00 00 50 51 52
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[5], 10));
+ // 53 54 55 56 57 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[5], 6));
+
+ // 00 00 00 00 00 00 60 61
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[6], 12));
+ // 62 63 64 65 66 67 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[6], 4));
+
+ // 00 00 00 00 00 00 00 70
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[7], 14));
+ // 71 72 73 74 75 76 77 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[7], 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[1][i + j / 2] += x;
+//
+// A0 = src[0] + src[1], A1 = src[2] + src[3], ...
+//
+// A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00
+// 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00
+// 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00
+// 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00
+// 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00
+// 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00
+// 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00
+// 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00
+//
+// partial[3] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(__m128i* v_src_16,
+ __m128i* partial_lo,
+ __m128i* partial_hi) {
+ __m128i v_d1_temp[8];
+ const __m128i v_zero = _mm_setzero_si128();
+
+ for (int i = 0; i < 8; ++i) {
+ v_d1_temp[i] = _mm_hadd_epi16(v_src_16[i], v_zero);
+ }
+
+ *partial_lo = *partial_hi = v_zero;
+ // A0 A1 A2 A3 00 00 00 00
+ *partial_lo = _mm_add_epi16(*partial_lo, v_d1_temp[0]);
+
+ // 00 B0 B1 B2 B3 00 00 00
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[1], 2));
+
+ // 00 00 C0 C1 C2 C3 00 00
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[2], 4));
+ // 00 00 00 D0 D1 D2 D3 00
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[3], 6));
+ // 00 00 00 00 E0 E1 E2 E3
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[4], 8));
+
+ // 00 00 00 00 00 F0 F1 F2
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[5], 10));
+ // F3 00 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[5], 6));
+
+ // 00 00 00 00 00 00 G0 G1
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[6], 12));
+ // G2 G3 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[6], 4));
+
+ // 00 00 00 00 00 00 00 H0
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[7], 14));
+ // H1 H2 H3 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[7], 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[7][i / 2 + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00
+// 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00
+// 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00
+// 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00
+// 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00
+// 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00
+// 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00
+//
+// partial[5] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(__m128i* v_src, __m128i* partial_lo,
+ __m128i* partial_hi) {
+ __m128i v_pair_add[4];
+ // Add vertical source pairs.
+ v_pair_add[0] = _mm_add_epi16(v_src[0], v_src[1]);
+ v_pair_add[1] = _mm_add_epi16(v_src[2], v_src[3]);
+ v_pair_add[2] = _mm_add_epi16(v_src[4], v_src[5]);
+ v_pair_add[3] = _mm_add_epi16(v_src[6], v_src[7]);
+
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ *partial_lo = v_pair_add[0];
+ // 00 00 00 00 00 00 00 00
+ // 00 00 00 00 00 00 00 00
+ *partial_hi = _mm_setzero_si128();
+
+ // 00 20 21 22 23 24 25 26
+ // 00 30 31 32 33 34 35 36
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[1], 2));
+ // 27 00 00 00 00 00 00 00
+ // 37 00 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[1], 14));
+
+ // 00 00 40 41 42 43 44 45
+ // 00 00 50 51 52 53 54 55
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[2], 4));
+ // 46 47 00 00 00 00 00 00
+ // 56 57 00 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[2], 12));
+
+ // 00 00 00 60 61 62 63 64
+ // 00 00 00 70 71 72 73 74
+ *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[3], 6));
+ // 65 66 67 00 00 00 00 00
+ // 75 76 77 00 00 00 00 00
+ *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[3], 10));
+}
+
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* src, ptrdiff_t stride,
+ __m128i* partial_lo,
+ __m128i* partial_hi) {
+ // 8x8 input
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ __m128i v_src[8];
+ for (auto& i : v_src) {
+ i = LoadLo8(src);
+ src += stride;
+ }
+
+ const __m128i v_zero = _mm_setzero_si128();
+ // partial for direction 2
+ // --------------------------------------------------------------------------
+ // partial[2][i] += x;
+ // 00 10 20 30 40 50 60 70 00 00 00 00 00 00 00 00
+ // 01 11 21 31 41 51 61 71 00 00 00 00 00 00 00 00
+ // 02 12 22 32 42 52 62 72 00 00 00 00 00 00 00 00
+ // 03 13 23 33 43 53 63 73 00 00 00 00 00 00 00 00
+ // 04 14 24 34 44 54 64 74 00 00 00 00 00 00 00 00
+ // 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00
+ // 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00
+ // 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00
+ const __m128i v_src_4_0 = _mm_unpacklo_epi64(v_src[0], v_src[4]);
+ const __m128i v_src_5_1 = _mm_unpacklo_epi64(v_src[1], v_src[5]);
+ const __m128i v_src_6_2 = _mm_unpacklo_epi64(v_src[2], v_src[6]);
+ const __m128i v_src_7_3 = _mm_unpacklo_epi64(v_src[3], v_src[7]);
+ const __m128i v_hsum_4_0 = _mm_sad_epu8(v_src_4_0, v_zero);
+ const __m128i v_hsum_5_1 = _mm_sad_epu8(v_src_5_1, v_zero);
+ const __m128i v_hsum_6_2 = _mm_sad_epu8(v_src_6_2, v_zero);
+ const __m128i v_hsum_7_3 = _mm_sad_epu8(v_src_7_3, v_zero);
+ const __m128i v_hsum_1_0 = _mm_unpacklo_epi16(v_hsum_4_0, v_hsum_5_1);
+ const __m128i v_hsum_3_2 = _mm_unpacklo_epi16(v_hsum_6_2, v_hsum_7_3);
+ const __m128i v_hsum_5_4 = _mm_unpackhi_epi16(v_hsum_4_0, v_hsum_5_1);
+ const __m128i v_hsum_7_6 = _mm_unpackhi_epi16(v_hsum_6_2, v_hsum_7_3);
+ partial_lo[2] =
+ _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_hsum_1_0, v_hsum_3_2),
+ _mm_unpacklo_epi32(v_hsum_5_4, v_hsum_7_6));
+
+ __m128i v_src_16[8];
+ for (int i = 0; i < 8; ++i) {
+ v_src_16[i] = _mm_cvtepu8_epi16(v_src[i]);
+ }
+
+ // partial for direction 6
+ // --------------------------------------------------------------------------
+ // partial[6][j] += x;
+ // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 00
+ // 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 00
+ // 20 21 22 23 24 25 26 27 00 00 00 00 00 00 00 00
+ // 30 31 32 33 34 35 36 37 00 00 00 00 00 00 00 00
+ // 40 41 42 43 44 45 46 47 00 00 00 00 00 00 00 00
+ // 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00
+ // 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00
+ // 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00
+ partial_lo[6] = v_src_16[0];
+ for (int i = 1; i < 8; ++i) {
+ partial_lo[6] = _mm_add_epi16(partial_lo[6], v_src_16[i]);
+ }
+
+ // partial for direction 0
+ AddPartial_D0_D4(v_src_16, &partial_lo[0], &partial_hi[0]);
+
+ // partial for direction 1
+ AddPartial_D1_D3(v_src_16, &partial_lo[1], &partial_hi[1]);
+
+ // partial for direction 7
+ AddPartial_D5_D7(v_src_16, &partial_lo[7], &partial_hi[7]);
+
+ __m128i v_src_reverse[8];
+ const __m128i reverser =
+ _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
+ for (int i = 0; i < 8; ++i) {
+ v_src_reverse[i] = _mm_shuffle_epi8(v_src_16[i], reverser);
+ }
+
+ // partial for direction 4
+ AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]);
+
+ // partial for direction 3
+ AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]);
+
+ // partial for direction 5
+ AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]);
+}
+
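+// Sums the four 32-bit lanes of |a| into a single value.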
+inline uint32_t SumVector_S32(__m128i a) {
+ a = _mm_hadd_epi32(a, a);
+ a = _mm_add_epi32(a, _mm_srli_si128(a, 4));
+ return _mm_cvtsi128_si32(a);
+}
+
+// |cost[0]| and |cost[4]| square each input element, add it to the square of
+// the corresponding element from the other end of the vector, and weight the
+// sum by the matching |kCdefDivisionTable[]| element:
+// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+//     kCdefDivisionTable[i];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[7];
+inline uint32_t Cost0Or4(const __m128i a, const __m128i b,
+ const __m128i division_table[2]) {
+ // Reverse and clear upper 2 bytes.
+ const __m128i reverser =
+ _mm_set_epi32(0x80800100, 0x03020504, 0x07060908, 0x0b0a0d0c);
+ // 14 13 12 11 10 09 08 ZZ
+ const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
+ // 00 14 01 13 02 12 03 11
+ const __m128i ab_lo = _mm_unpacklo_epi16(a, b_reversed);
+ // 04 10 05 09 06 08 07 ZZ
+ const __m128i ab_hi = _mm_unpackhi_epi16(a, b_reversed);
+
+ // Square(partial[0][i]) + Square(partial[0][14 - i])
+ const __m128i square_lo = _mm_madd_epi16(ab_lo, ab_lo);
+ const __m128i square_hi = _mm_madd_epi16(ab_hi, ab_hi);
+
+ const __m128i c = _mm_mullo_epi32(square_lo, division_table[0]);
+ const __m128i d = _mm_mullo_epi32(square_hi, division_table[1]);
+ return SumVector_S32(_mm_add_epi32(c, d));
+}
+
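+// The odd partials hold 11 values (indices 0-10). Elements 0-2 pair with the
+// mirrored elements 10-8; elements 3-7 have no mirrored counterpart (those
+// lanes are cleared by |reverser|) and are weighted by the repeated
+// kCdefDivisionTable[7] entries in |kCdefDivisionTableOddPadded|.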
+inline uint32_t CostOdd(const __m128i a, const __m128i b,
+ const __m128i division_table[2]) {
+ // Reverse and clear upper 10 bytes.
+ const __m128i reverser =
+ _mm_set_epi32(0x80808080, 0x80808080, 0x80800100, 0x03020504);
+ // 10 09 08 ZZ ZZ ZZ ZZ ZZ
+ const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
+ // 00 10 01 09 02 08 03 ZZ
+ const __m128i ab_lo = _mm_unpacklo_epi16(a, b_reversed);
+ // 04 ZZ 05 ZZ 06 ZZ 07 ZZ
+ const __m128i ab_hi = _mm_unpackhi_epi16(a, b_reversed);
+
+ // Square(partial[0][i]) + Square(partial[0][10 - i])
+ const __m128i square_lo = _mm_madd_epi16(ab_lo, ab_lo);
+ const __m128i square_hi = _mm_madd_epi16(ab_hi, ab_hi);
+
+ const __m128i c = _mm_mullo_epi32(square_lo, division_table[0]);
+ const __m128i d = _mm_mullo_epi32(square_hi, division_table[1]);
+ return SumVector_S32(_mm_add_epi32(c, d));
+}
+
+// Sum of squared elements.
+inline uint32_t SquareSum_S16(const __m128i a) {
+ const __m128i square = _mm_madd_epi16(a, a);
+ return SumVector_S32(square);
+}
+
+void CdefDirection_SSE4_1(const void* const source, ptrdiff_t stride,
+ uint8_t* const direction, int* const variance) {
+ assert(direction != nullptr);
+ assert(variance != nullptr);
+ const auto* src = static_cast<const uint8_t*>(source);
+ uint32_t cost[8];
+ __m128i partial_lo[8], partial_hi[8];
+
+ AddPartial(src, stride, partial_lo, partial_hi);
+
+ cost[2] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[2]);
+ cost[6] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[6]);
+
+ const __m128i division_table[2] = {LoadUnaligned16(kCdefDivisionTable),
+ LoadUnaligned16(kCdefDivisionTable + 4)};
+
+ cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
+ cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
+
+ const __m128i division_table_odd[2] = {
+ LoadAligned16(kCdefDivisionTableOddPadded),
+ LoadAligned16(kCdefDivisionTableOddPadded + 4)};
+
+ cost[1] = CostOdd(partial_lo[1], partial_hi[1], division_table_odd);
+ cost[3] = CostOdd(partial_lo[3], partial_hi[3], division_table_odd);
+ cost[5] = CostOdd(partial_lo[5], partial_hi[5], division_table_odd);
+ cost[7] = CostOdd(partial_lo[7], partial_hi[7], division_table_odd);
+
+ uint32_t best_cost = 0;
+ *direction = 0;
+ for (int i = 0; i < 8; ++i) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ *direction = i;
+ }
+ }
+ *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
+ __m128i* output, const int direction) {
+ // Each |direction| describes a different set of source values. Expand this
+ // set by negating each set. For |direction| == 0 this gives a diagonal line
+ // from top right to bottom left. The first value is y, the second x. Negative
+ // y values move up.
+ // a b c d
+ // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+ // c
+ // a
+ // 0
+ // b
+ // d
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = LoadUnaligned16(src - y_0 * stride - x_0);
+ output[1] = LoadUnaligned16(src + y_0 * stride + x_0);
+ output[2] = LoadUnaligned16(src - y_1 * stride - x_1);
+ output[3] = LoadUnaligned16(src + y_1 * stride + x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
+// do 2 rows at a time.
+void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
+ __m128i* output, const int direction) {
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = LoadHi8(LoadLo8(src - y_0 * stride - x_0),
+ src - y_0 * stride + stride - x_0);
+ output[1] = LoadHi8(LoadLo8(src + y_0 * stride + x_0),
+ src + y_0 * stride + stride + x_0);
+ output[2] = LoadHi8(LoadLo8(src - y_1 * stride - x_1),
+ src - y_1 * stride + stride - x_1);
+ output[3] = LoadHi8(LoadLo8(src + y_1 * stride + x_1),
+ src + y_1 * stride + stride + x_1);
+}
+
+inline __m128i Constrain(const __m128i& pixel, const __m128i& reference,
+ const __m128i& damping, const __m128i& threshold) {
+ const __m128i diff = _mm_sub_epi16(pixel, reference);
+ const __m128i abs_diff = _mm_abs_epi16(diff);
+ // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping),
+ // 0, std::abs(diff))
+ const __m128i shifted_diff = _mm_srl_epi16(abs_diff, damping);
+ // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
+ // [3, 6]. If pixel == kCdefLargeValue(0x4000), shifted_diff will always be
+ // larger than threshold. Subtract using saturation will return 0 when pixel
+ // == kCdefLargeValue.
+ static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+ const __m128i thresh_minus_shifted_diff =
+ _mm_subs_epu16(threshold, shifted_diff);
+ const __m128i clamp_abs_diff =
+ _mm_min_epi16(thresh_minus_shifted_diff, abs_diff);
+ // Restore the sign.
+ return _mm_sign_epi16(clamp_abs_diff, diff);
+}
+
+inline __m128i ApplyConstrainAndTap(const __m128i& pixel, const __m128i& val,
+ const __m128i& tap, const __m128i& damping,
+ const __m128i& threshold) {
+ const __m128i constrained = Constrain(val, pixel, damping, threshold);
+ return _mm_mullo_epi16(constrained, tap);
+}
+
+template <int width, bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_SSE4_1(const uint16_t* src, const ptrdiff_t src_stride,
+ const int height, const int primary_strength,
+ const int secondary_strength, const int damping,
+ const int direction, void* dest,
+ const ptrdiff_t dst_stride) {
+ static_assert(width == 8 || width == 4, "Invalid CDEF width.");
+ static_assert(enable_primary || enable_secondary, "");
+ constexpr bool clipping_required = enable_primary && enable_secondary;
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i primary_damping_shift, secondary_damping_shift;
+
+ // FloorLog2() requires input to be > 0.
+ // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+ if (enable_primary) {
+ // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+ // for UV filtering.
+ primary_damping_shift =
+ _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength)));
+ }
+ if (enable_secondary) {
+ // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+ // necessary.
+ assert(damping - FloorLog2(secondary_strength) >= 0);
+ secondary_damping_shift =
+ _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength));
+ }
+
+ const __m128i primary_tap_0 =
+ _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][0]);
+ const __m128i primary_tap_1 =
+ _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][1]);
+ const __m128i secondary_tap_0 = _mm_set1_epi16(kCdefSecondaryTap0);
+ const __m128i secondary_tap_1 = _mm_set1_epi16(kCdefSecondaryTap1);
+ const __m128i cdef_large_value_mask =
+ _mm_set1_epi16(static_cast<int16_t>(~kCdefLargeValue));
+ const __m128i primary_threshold = _mm_set1_epi16(primary_strength);
+ const __m128i secondary_threshold = _mm_set1_epi16(secondary_strength);
+
+ int y = height;
+ do {
+ __m128i pixel;
+ if (width == 8) {
+ pixel = LoadUnaligned16(src);
+ } else {
+ pixel = LoadHi8(LoadLo8(src), src + src_stride);
+ }
+
+ __m128i min = pixel;
+ __m128i max = pixel;
+ __m128i sum;
+
+ if (enable_primary) {
+ // Primary |direction|.
+ __m128i primary_val[4];
+ if (width == 8) {
+ LoadDirection(src, src_stride, primary_val, direction);
+ } else {
+ LoadDirection4(src, src_stride, primary_val, direction);
+ }
+
+ if (clipping_required) {
+ min = _mm_min_epu16(min, primary_val[0]);
+ min = _mm_min_epu16(min, primary_val[1]);
+ min = _mm_min_epu16(min, primary_val[2]);
+ min = _mm_min_epu16(min, primary_val[3]);
+
+ // The source is 16 bits, however, we only really care about the lower
+ // 8 bits. The upper 8 bits contain the "large" flag. After the final
+ // primary max has been calculated, zero out the upper 8 bits. Use this
+ // to find the "16 bit" max.
+ const __m128i max_p01 = _mm_max_epu8(primary_val[0], primary_val[1]);
+ const __m128i max_p23 = _mm_max_epu8(primary_val[2], primary_val[3]);
+ const __m128i max_p = _mm_max_epu8(max_p01, max_p23);
+ max = _mm_max_epu16(max, _mm_and_si128(max_p, cdef_large_value_mask));
+ }
+
+ sum = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0,
+ primary_damping_shift, primary_threshold);
+ sum = _mm_add_epi16(
+ sum, ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_0,
+ primary_damping_shift, primary_threshold));
+ sum = _mm_add_epi16(
+ sum, ApplyConstrainAndTap(pixel, primary_val[2], primary_tap_1,
+ primary_damping_shift, primary_threshold));
+ sum = _mm_add_epi16(
+ sum, ApplyConstrainAndTap(pixel, primary_val[3], primary_tap_1,
+ primary_damping_shift, primary_threshold));
+ } else {
+ sum = _mm_setzero_si128();
+ }
+
+ if (enable_secondary) {
+ // Secondary |direction| values (+/- 2). Clamp |direction|.
+ __m128i secondary_val[8];
+ if (width == 8) {
+ LoadDirection(src, src_stride, secondary_val, direction + 2);
+ LoadDirection(src, src_stride, secondary_val + 4, direction - 2);
+ } else {
+ LoadDirection4(src, src_stride, secondary_val, direction + 2);
+ LoadDirection4(src, src_stride, secondary_val + 4, direction - 2);
+ }
+
+ if (clipping_required) {
+ min = _mm_min_epu16(min, secondary_val[0]);
+ min = _mm_min_epu16(min, secondary_val[1]);
+ min = _mm_min_epu16(min, secondary_val[2]);
+ min = _mm_min_epu16(min, secondary_val[3]);
+ min = _mm_min_epu16(min, secondary_val[4]);
+ min = _mm_min_epu16(min, secondary_val[5]);
+ min = _mm_min_epu16(min, secondary_val[6]);
+ min = _mm_min_epu16(min, secondary_val[7]);
+
+ const __m128i max_s01 =
+ _mm_max_epu8(secondary_val[0], secondary_val[1]);
+ const __m128i max_s23 =
+ _mm_max_epu8(secondary_val[2], secondary_val[3]);
+ const __m128i max_s45 =
+ _mm_max_epu8(secondary_val[4], secondary_val[5]);
+ const __m128i max_s67 =
+ _mm_max_epu8(secondary_val[6], secondary_val[7]);
+ const __m128i max_s = _mm_max_epu8(_mm_max_epu8(max_s01, max_s23),
+ _mm_max_epu8(max_s45, max_s67));
+ max = _mm_max_epu16(max, _mm_and_si128(max_s, cdef_large_value_mask));
+ }
+
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[4], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[5], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[6], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ sum = _mm_add_epi16(
+ sum,
+ ApplyConstrainAndTap(pixel, secondary_val[7], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ }
+ // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)
+ const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15);
+ // 8 + sum
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(8));
+ // (... - (sum < 0)) >> 4
+ sum = _mm_add_epi16(sum, sum_lt_0);
+ sum = _mm_srai_epi16(sum, 4);
+ // pixel + ...
+ sum = _mm_add_epi16(sum, pixel);
+ if (clipping_required) {
+ // Clip3
+ sum = _mm_min_epi16(sum, max);
+ sum = _mm_max_epi16(sum, min);
+ }
+
+ const __m128i result = _mm_packus_epi16(sum, sum);
+ if (width == 8) {
+ src += src_stride;
+ StoreLo8(dst, result);
+ dst += dst_stride;
+ --y;
+ } else {
+ src += src_stride << 1;
+ Store4(dst, result);
+ dst += dst_stride;
+ Store4(dst, _mm_srli_si128(result, 4));
+ dst += dst_stride;
+ y -= 2;
+ }
+ } while (y != 0);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->cdef_direction = CdefDirection_SSE4_1;
+ dsp->cdef_filters[0][0] = CdefFilter_SSE4_1<4>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_SSE4_1<4, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] = CdefFilter_SSE4_1<4, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_SSE4_1<8>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_SSE4_1<8, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] = CdefFilter_SSE4_1<8, /*enable_primary=*/false>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void CdefInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void CdefInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/cdef_sse4.h b/src/dsp/x86/cdef_sse4.h
new file mode 100644
index 0000000..6631eb7
--- /dev/null
+++ b/src/dsp/x86/cdef_sse4.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_CdefDirection
+#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_CdefFilters
+#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_
diff --git a/src/dsp/x86/common_avx2.h b/src/dsp/x86/common_avx2.h
new file mode 100644
index 0000000..4ce7de2
--- /dev/null
+++ b/src/dsp/x86/common_avx2.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+
+#include <immintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+namespace libgav1 {
+namespace dsp {
+
+//------------------------------------------------------------------------------
+// Compatibility functions.
+
+inline __m256i SetrM128i(const __m128i lo, const __m128i hi) {
+ // For compatibility with older gcc toolchains (< 8), use
+ // _mm256_inserti128_si256 instead of _mm256_setr_m128i. Newer gcc versions
+ // implement the latter similarly to the following; clang uses a different
+ // method, but no differences in the generated assembly have been observed.
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m256i LoadAligned32(const void* a) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ return _mm256_load_si256(static_cast<const __m256i*>(a));
+}
+
+inline void LoadAligned64(const void* a, __m256i dst[2]) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0);
+ dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1);
+}
+
+inline __m256i LoadUnaligned32(const void* a) {
+ return _mm256_loadu_si256(static_cast<const __m256i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m256i MaskOverreads(const __m256i source,
+ const ptrdiff_t over_read_in_bytes) {
+ __m256i dst = source;
+#if LIBGAV1_MSAN
+ if (over_read_in_bytes >= 32) return _mm256_setzero_si256();
+ if (over_read_in_bytes > 0) {
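+ // Build a 16-byte mask whose top |over_read_in_bytes| % 16 bytes are zero.
+ // When fewer than 16 bytes were over-read only the tail of the upper lane is
+ // cleared; otherwise the entire upper lane and the tail of the lower lane
+ // are cleared.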
+ __m128i m = _mm_set1_epi8(-1);
+ for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) {
+ m = _mm_srli_si128(m, 1);
+ }
+ const __m256i mask = (over_read_in_bytes < 16)
+ ? SetrM128i(_mm_set1_epi8(-1), m)
+ : SetrM128i(m, _mm_setzero_si128());
+ dst = _mm256_and_si256(dst, mask);
+ }
+#else
+ static_cast<void>(over_read_in_bytes);
+#endif
+ return dst;
+}
+
+inline __m256i LoadAligned32Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+}
+
+inline void LoadAligned64Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i dst[2]) {
+ dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+ dst[1] = MaskOverreads(LoadAligned32(static_cast<const __m256i*>(source) + 1),
+ over_read_in_bytes);
+}
+
+inline __m256i LoadUnaligned32Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes);
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void StoreAligned32(void* a, const __m256i v) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ _mm256_store_si256(static_cast<__m256i*>(a), v);
+}
+
+inline void StoreAligned64(void* a, const __m256i v[2]) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]);
+ _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]);
+}
+
+inline void StoreUnaligned32(void* a, const __m256i v) {
+ _mm256_storeu_si256(static_cast<__m256i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
+inline __m256i RightShiftWithRounding_S16(const __m256i v_val_d, int bits) {
+ assert(bits <= 16);
+ const __m256i v_bias_d =
+ _mm256_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+ const __m256i v_tmp_d = _mm256_add_epi16(v_val_d, v_bias_d);
+ return _mm256_srai_epi16(v_tmp_d, bits);
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_AVX2
+#endif // LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
diff --git a/src/dsp/x86/common_sse4.h b/src/dsp/x86/common_sse4.h
new file mode 100644
index 0000000..c510f8c
--- /dev/null
+++ b/src/dsp/x86/common_sse4.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#if 0
+#include <cinttypes>
+#include <cstdio>
+
+// Quite useful macro for debugging. Left here for convenience.
+inline void PrintReg(const __m128i r, const char* const name, int size) {
+ int n;
+ union {
+ __m128i r;
+ uint8_t i8[16];
+ uint16_t i16[8];
+ uint32_t i32[4];
+ uint64_t i64[2];
+ } tmp;
+ tmp.r = r;
+ fprintf(stderr, "%s\t: ", name);
+ if (size == 8) {
+ for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]);
+ } else if (size == 16) {
+ for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]);
+ } else if (size == 32) {
+ for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]);
+ } else {
+ for (n = 0; n < 2; ++n)
+ fprintf(stderr, "%.16" PRIx64 " ", static_cast<uint64_t>(tmp.i64[n]));
+ }
+ fprintf(stderr, "\n");
+}
+
+inline void PrintReg(const int r, const char* const name) {
+ fprintf(stderr, "%s: %d\n", name, r);
+}
+
+inline void PrintRegX(const int r, const char* const name) {
+ fprintf(stderr, "%s: %.8x\n", name, r);
+}
+
+#define PR(var, N) PrintReg(var, #var, N)
+#define PD(var) PrintReg(var, #var);
+#define PX(var) PrintRegX(var, #var);
+#endif // 0
+
+namespace libgav1 {
+namespace dsp {
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m128i Load2(const void* src) {
+ int16_t val;
+ memcpy(&val, src, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load2x2(const void* src1, const void* src2) {
+ uint16_t val1;
+ uint16_t val2;
+ memcpy(&val1, src1, sizeof(val1));
+ memcpy(&val2, src2, sizeof(val2));
+ return _mm_cvtsi32_si128(val1 | (val2 << 16));
+}
+
+// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
+template <int lane>
+inline __m128i Load2(const void* const buf, __m128i val) {
+ uint16_t temp;
+ memcpy(&temp, buf, 2);
+ return _mm_insert_epi16(val, temp, lane);
+}
+
+inline __m128i Load4(const void* src) {
+ // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
+ // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+ // movss instruction.
+ //
+ // Until compiler support of _mm_loadu_si32 is widespread, use of
+ // _mm_loadu_si32 is banned.
+ int val;
+ memcpy(&val, src, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load4x2(const void* src1, const void* src2) {
+ // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
+ // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+ // movss instruction.
+ //
+ // Until compiler support of _mm_loadu_si32 is widespread, use of
+ // _mm_loadu_si32 is banned.
+ int val1, val2;
+ memcpy(&val1, src1, sizeof(val1));
+ memcpy(&val2, src2, sizeof(val2));
+ return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1);
+}
+
+inline __m128i LoadLo8(const void* a) {
+ return _mm_loadl_epi64(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadHi8(const __m128i v, const void* a) {
+ const __m128 x =
+ _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a));
+ return _mm_castps_si128(x);
+}
+
+inline __m128i LoadUnaligned16(const void* a) {
+ return _mm_loadu_si128(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadAligned16(const void* a) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+ return _mm_load_si128(static_cast<const __m128i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m128i MaskOverreads(const __m128i source,
+ const ptrdiff_t over_read_in_bytes) {
+ __m128i dst = source;
+#if LIBGAV1_MSAN
+ if (over_read_in_bytes > 0) {
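+ // Shift an all-ones mask right by one byte per over-read byte so the
+ // trailing |over_read_in_bytes| bytes of the load are cleared.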
+ __m128i mask = _mm_set1_epi8(-1);
+ for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) {
+ mask = _mm_srli_si128(mask, 1);
+ }
+ dst = _mm_and_si128(dst, mask);
+ }
+#else
+ static_cast<void>(over_read_in_bytes);
+#endif
+ return dst;
+}
+
+inline __m128i LoadLo8Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
+}
+
+inline __m128i LoadHi8Msan(const __m128i v, const void* source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadHi8(v, source), over_read_in_bytes);
+}
+
+inline __m128i LoadAligned16Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
+}
+
+inline __m128i LoadUnaligned16Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void Store2(void* dst, const __m128i x) {
+ const int val = _mm_cvtsi128_si32(x);
+ memcpy(dst, &val, 2);
+}
+
+inline void Store4(void* dst, const __m128i x) {
+ const int val = _mm_cvtsi128_si32(x);
+ memcpy(dst, &val, sizeof(val));
+}
+
+inline void StoreLo8(void* a, const __m128i v) {
+ _mm_storel_epi64(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreHi8(void* a, const __m128i v) {
+ _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v));
+}
+
+inline void StoreAligned16(void* a, const __m128i v) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+ _mm_store_si128(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreUnaligned16(void* a, const __m128i v) {
+ _mm_storeu_si128(static_cast<__m128i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
+inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) {
+ assert(bits <= 16);
+ // Shift out all but the last bit.
+ const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
+ // Avg with zero will shift by 1 and round.
+ return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128());
+}
+
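+// Adds the rounding bias of (1 << bits) >> 1 and then arithmetic-shifts right.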
+inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) {
+ assert(bits <= 16);
+ const __m128i v_bias_d =
+ _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+ const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
+ return _mm_srai_epi16(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srli_epi32(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+//------------------------------------------------------------------------------
+// Masking utilities
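+// Returns a mask with the |n| most significant bytes set to 0xff and the
+// remaining bytes set to zero.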
+inline __m128i MaskHighNBytes(int n) {
+ static constexpr uint8_t kMask[32] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ };
+
+ return LoadUnaligned16(kMask + n);
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+#endif // LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
diff --git a/src/dsp/x86/convolve_avx2.cc b/src/dsp/x86/convolve_avx2.cc
new file mode 100644
index 0000000..3df2120
--- /dev/null
+++ b/src/dsp/x86/convolve_avx2.cc
@@ -0,0 +1,534 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+constexpr int kHorizontalOffset = 3;
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from outranging int16_t.
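+// Each |taps[i]| broadcasts one pair of adjacent filter coefficients so that
+// _mm256_maddubs_epi16 multiplies two neighboring source bytes by the pair
+// and adds the products in one step.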
+template <int filter_index>
+__m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
+ __m256i sum;
+ if (filter_index < 2) {
+ // 6 taps.
+ const __m256i v_madd_21 = _mm256_maddubs_epi16(src[0], taps[0]); // k2k1
+ const __m256i v_madd_43 = _mm256_maddubs_epi16(src[1], taps[1]); // k4k3
+ const __m256i v_madd_65 = _mm256_maddubs_epi16(src[2], taps[2]); // k6k5
+ sum = _mm256_add_epi16(v_madd_21, v_madd_43);
+ sum = _mm256_add_epi16(sum, v_madd_65);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ const __m256i v_madd_10 = _mm256_maddubs_epi16(src[0], taps[0]); // k1k0
+ const __m256i v_madd_32 = _mm256_maddubs_epi16(src[1], taps[1]); // k3k2
+ const __m256i v_madd_54 = _mm256_maddubs_epi16(src[2], taps[2]); // k5k4
+ const __m256i v_madd_76 = _mm256_maddubs_epi16(src[3], taps[3]); // k7k6
+ const __m256i v_sum_3210 = _mm256_add_epi16(v_madd_10, v_madd_32);
+ const __m256i v_sum_7654 = _mm256_add_epi16(v_madd_54, v_madd_76);
+ sum = _mm256_add_epi16(v_sum_7654, v_sum_3210);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ sum = _mm256_maddubs_epi16(src[0], taps[0]); // k4k3
+ } else {
+ // 4 taps.
+ const __m256i v_madd_32 = _mm256_maddubs_epi16(src[0], taps[0]); // k3k2
+ const __m256i v_madd_54 = _mm256_maddubs_epi16(src[1], taps[1]); // k5k4
+ sum = _mm256_add_epi16(v_madd_32, v_madd_54);
+ }
+ return sum;
+}
+
+template <int filter_index>
+__m256i SumHorizontalTaps(const __m256i* const src,
+ const __m256i* const v_tap) {
+ __m256i v_src[4];
+ const __m256i src_long = *src;
+ const __m256i src_long_dup_lo = _mm256_unpacklo_epi8(src_long, src_long);
+ const __m256i src_long_dup_hi = _mm256_unpackhi_epi8(src_long, src_long);
+
+ if (filter_index < 2) {
+ // 6 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21
+ v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
+ v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65
+ } else if (filter_index == 2) {
+ // 8 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10
+ v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
+ v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
+ v_src[3] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76
+ } else if (filter_index == 3) {
+ // 2 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
+ } else if (filter_index > 3) {
+ // 4 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
+ v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
+ }
+ return SumOnePassTaps<filter_index>(v_src, v_tap);
+}
+
+template <int filter_index>
+__m256i SimpleHorizontalTaps(const __m256i* const src,
+ const __m256i* const v_tap) {
+ __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
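+ // For the 8bpp path kInterRoundBitsHorizontal is 3, so in effect this adds
+ // 1 << 1 before a single rounding shift by kFilterBits - 1.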
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = _mm256_add_epi16(sum, _mm256_set1_epi16(first_shift_rounding_bit));
+ sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+ return _mm256_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);
+
+ if (filter_index == 3) {
+ // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
+ const __m128i v_src_43 = _mm_shuffle_epi8(
+ v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
+ const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ return v_sum_43;
+ }
+
+ // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
+ const __m128i v_src_32 = _mm_shuffle_epi8(
+ v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302));
+ // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx
+ const __m128i v_src_54 = _mm_shuffle_epi8(
+ v_src, _mm_set_epi32(0x800f0f0e, 0x0e0d0d0c, 0x80070706, 0x06050504));
+ const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
+ return v_sum_5432;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+ sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+ return _mm_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ const __m128i sum =
+ SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+// Filter 2xh sizes.
+template <int num_taps, int step, int filter_index, bool is_2d = false,
+ bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int /*width*/, const int height,
+ const __m128i* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+
+ // Horizontal passes only need to account for |num_taps| 2 and 4 when
+ // |width| <= 4.
+ assert(num_taps <= 4);
+ if (num_taps <= 4) {
+ if (!is_compound) {
+ int y = 0;
+ do {
+ if (is_2d) {
+ const __m128i sum =
+ HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+ Store4(&dest16[0], sum);
+ dest16 += pred_stride;
+ Store4(&dest16[0], _mm_srli_si128(sum, 8));
+ dest16 += pred_stride;
+ } else {
+ const __m128i sum =
+ SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ Store2(dest8, sum);
+ dest8 += pred_stride;
+ Store2(dest8, _mm_srli_si128(sum, 4));
+ dest8 += pred_stride;
+ }
+
+ src += src_stride << 1;
+ y += 2;
+ } while (y < height - 1);
+
+ // The 2d filters have an odd |height| because the horizontal pass
+ // generates context for the vertical pass.
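+      // (Here |height| is the intermediate height, block height + vertical
+      // taps - 1; block heights and tap counts are both even, so the sum is
+      // always odd.)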
+ if (is_2d) {
+ assert(height % 2 == 1);
+ __m128i sum;
+ const __m128i input = LoadLo8(&src[2]);
+ if (filter_index == 3) {
+ // 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_43 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
+ sum = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ } else {
+ // 02 03 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_32 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
+ // 04 05 05 06 06 07 07 08 ...
+ const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
+ const __m128i v_madd_32 =
+ _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 =
+ _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ sum = _mm_add_epi16(v_madd_54, v_madd_32);
+ }
+ sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+ Store4(dest16, sum);
+ }
+ }
+ }
+}
+
+// Filter widths >= 4.
+template <int num_taps, int step, int filter_index, bool is_2d = false,
+ bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int width, const int height,
+ const __m256i* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+
+ if (width >= 32) {
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ if (is_2d || is_compound) {
+ // placeholder
+ } else {
+ // Load src used to calculate dest8[7:0] and dest8[23:16].
+ const __m256i src_long = LoadUnaligned32(&src[x]);
+ const __m256i result =
+ SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ // Load src used to calculate dest8[15:8] and dest8[31:24].
+ const __m256i src_long2 = LoadUnaligned32(&src[x + 8]);
+ const __m256i result2 =
+ SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+ // Combine results and store.
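+          // |result| holds bytes [x, x+7] in lane 0 and [x+16, x+23] in
+          // lane 1; |result2| holds the other two 8-byte groups, so the
+          // per-lane unpacklo_epi64 emits all 32 output bytes in order.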
+ StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2));
+ }
+ x += step * 4;
+ } while (x < width);
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (--y != 0);
+ } else if (width == 16) {
+ int y = height;
+ do {
+ if (is_2d || is_compound) {
+ // placeholder
+ } else {
+        // Load into two 128-bit lanes.
+ const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]),
+ LoadUnaligned16(&src[src_stride]));
+ const __m256i result =
+ SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ const __m256i src_long2 = SetrM128i(
+ LoadUnaligned16(&src[8]), LoadUnaligned16(&src[8 + src_stride]));
+ const __m256i result2 =
+ SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+ const __m256i packed_result = _mm256_unpacklo_epi64(result, result2);
+ StoreUnaligned16(&dest8[0], _mm256_castsi256_si128(packed_result));
+ StoreUnaligned16(&dest8[pred_stride],
+ _mm256_extracti128_si256(packed_result, 1));
+ }
+ src += src_stride * 2;
+ dest8 += pred_stride * 2;
+ dest16 += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ if (is_2d || is_compound) {
+ // placeholder
+ } else {
+ const __m128i this_row = LoadUnaligned16(&src[0]);
+ const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+        // Load into two 128-bit lanes.
+ const __m256i src_long = SetrM128i(this_row, next_row);
+ const __m256i result =
+ SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ StoreLo8(&dest8[0], _mm256_castsi256_si128(result));
+ StoreLo8(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
+ }
+ src += src_stride * 2;
+ dest8 += pred_stride * 2;
+ dest16 += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+ } else { // width == 4
+ int y = height;
+ do {
+ if (is_2d || is_compound) {
+ // placeholder
+ } else {
+ const __m128i this_row = LoadUnaligned16(&src[0]);
+ const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+        // Load into two 128-bit lanes.
+ const __m256i src_long = SetrM128i(this_row, next_row);
+ const __m256i result =
+ SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ Store4(&dest8[0], _mm256_castsi256_si128(result));
+ Store4(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
+ }
+ src += src_stride * 2;
+ dest8 += pred_stride * 2;
+ dest16 += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+ __m128i* v_tap) {
+ if (num_taps == 8) {
+ v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0
+ v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
+ v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
+ v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
+ }
+ } else if (num_taps == 6) {
+ const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+ v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1
+ v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
+ v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ }
+ } else if (num_taps == 4) {
+ v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
+ v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ }
+ } else { // num_taps == 2
+ const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+ v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ }
+ }
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+ __m256i* v_tap) {
+ if (num_taps == 8) {
+ v_tap[0] = _mm256_broadcastw_epi16(*filter); // k1k0
+ v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2
+ v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4
+ v_tap[3] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 6)); // k7k6
+ if (is_2d_vertical) {
+ // placeholder
+ }
+ } else if (num_taps == 6) {
+ v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 1)); // k2k1
+ v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3
+ v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 5)); // k6k5
+ if (is_2d_vertical) {
+ // placeholder
+ }
+ } else if (num_taps == 4) {
+ v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2
+ v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4
+ if (is_2d_vertical) {
+ // placeholder
+ }
+ } else { // num_taps == 2
+ v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3
+ if (is_2d_vertical) {
+ // placeholder
+ }
+ }
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
+ const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+ const ptrdiff_t dst_stride, const int width, const int height,
+ const int filter_id, const int filter_index) {
+ assert(filter_id != 0);
+ __m128i v_tap[4];
+ const __m128i v_horizontal_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+ if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 8, 4, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 5) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 8, 5, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else { // 2 tap.
+ SetupTaps<2>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<2, 8, 3, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+ const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+ const ptrdiff_t dst_stride, const int width, const int height,
+ const int filter_id, const int filter_index) {
+ assert(filter_id != 0);
+ __m256i v_tap[4];
+ const __m128i v_horizontal_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+ if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<8, 8, 2, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 1) { // 6 tap.
+ SetupTaps<6>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<6, 8, 1, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 0) { // 6 tap.
+ SetupTaps<6>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<6, 8, 0, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 8, 4, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 5) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 8, 5, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else { // 2 tap.
+ SetupTaps<2>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<2, 8, 3, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+}
+
+void ConvolveHorizontal_AVX2(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int /*vertical_filter_index*/,
+ const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ // Set |src| to the outermost tap.
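+  // kHorizontalOffset (from convolve.inc) is the number of pixels to the
+  // left of the current position read by the widest supported filter.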
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width > 2) {
+ DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+ horizontal_filter_id, filter_index);
+ } else {
+    // Use the non-AVX2 version for smaller widths.
+ DoHorizontalPass2xH(src, reference_stride, dest, pred_stride, width, height,
+ horizontal_filter_id, filter_index);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_AVX2;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void ConvolveInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_AVX2() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_AVX2
diff --git a/src/dsp/x86/convolve_avx2.h b/src/dsp/x86/convolve_avx2.h
new file mode 100644
index 0000000..6179d98
--- /dev/null
+++ b/src/dsp/x86/convolve_avx2.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve, see the defines below for specifics. This
+// function is not thread-safe.
+void ConvolveInit_AVX2();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If avx2 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the avx2 implementation should be used.
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_AVX2
+#endif
+
+#endif // LIBGAV1_TARGETING_AVX2
+
+#endif // LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc
new file mode 100644
index 0000000..3a0fff5
--- /dev/null
+++ b/src/dsp/x86/convolve_sse4.cc
@@ -0,0 +1,2830 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/convolve.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from overflowing int16_t.
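+// As a rough bound (assuming the half filters' absolute tap values sum to
+// less than 128), even all-255 sources keep the sum within int16_t; the
+// unshifted filters could push it past 2^15 - 1.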
+template <int filter_index>
+__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
+ __m128i sum;
+ if (filter_index < 2) {
+ // 6 taps.
+ const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1
+ const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3
+ const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5
+ sum = _mm_add_epi16(v_madd_21, v_madd_43);
+ sum = _mm_add_epi16(sum, v_madd_65);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0
+ const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]); // k5k4
+ const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]); // k7k6
+ const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
+ const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
+ sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3
+ } else {
+ // 4 taps.
+ const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]); // k5k4
+ sum = _mm_add_epi16(v_madd_32, v_madd_54);
+ }
+ return sum;
+}
+
+template <int filter_index>
+__m128i SumHorizontalTaps(const uint8_t* const src,
+ const __m128i* const v_tap) {
+ __m128i v_src[4];
+ const __m128i src_long = LoadUnaligned16(src);
+ const __m128i src_long_dup_lo = _mm_unpacklo_epi8(src_long, src_long);
+ const __m128i src_long_dup_hi = _mm_unpackhi_epi8(src_long, src_long);
+
+ if (filter_index < 2) {
+ // 6 taps.
+ v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21
+ v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
+ v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65
+ } else if (filter_index == 2) {
+ // 8 taps.
+ v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10
+ v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
+ v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
+ v_src[3] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76
+ } else if (filter_index == 3) {
+ // 2 taps.
+ v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
+ } else if (filter_index > 3) {
+ // 4 taps.
+ v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
+ v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
+ }
+ const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+ return sum;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps(const uint8_t* const src,
+ const __m128i* const v_tap) {
+ __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+ sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+ return _mm_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i HorizontalTaps8To16(const uint8_t* const src,
+ const __m128i* const v_tap) {
+ const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int filter_index>
+__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ const __m128i input0 = LoadLo8(&src[2]);
+ const __m128i input1 = LoadLo8(&src[2 + src_stride]);
+
+ if (filter_index == 3) {
+ // 03 04 04 05 05 06 06 07 ....
+ const __m128i input0_dup =
+ _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 3);
+ // 13 14 14 15 15 16 16 17 ....
+ const __m128i input1_dup =
+ _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 3);
+ const __m128i v_src_43 = _mm_unpacklo_epi64(input0_dup, input1_dup);
+ const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ return v_sum_43;
+ }
+
+ // 02 03 03 04 04 05 05 06 06 07 ....
+ const __m128i input0_dup =
+ _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 1);
+ // 12 13 13 14 14 15 15 16 16 17 ....
+ const __m128i input1_dup =
+ _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 1);
+ // 04 05 05 06 06 07 07 08 ...
+ const __m128i input0_dup_54 = _mm_srli_si128(input0_dup, 4);
+ // 14 15 15 16 16 17 17 18 ...
+ const __m128i input1_dup_54 = _mm_srli_si128(input1_dup, 4);
+ const __m128i v_src_32 = _mm_unpacklo_epi64(input0_dup, input1_dup);
+ const __m128i v_src_54 = _mm_unpacklo_epi64(input0_dup_54, input1_dup_54);
+ const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
+ return v_sum_5432;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+ sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+ return _mm_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ const __m128i sum =
+ SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int num_taps, int step, int filter_index, bool is_2d = false,
+ bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int width, const int height,
+ const __m128i* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+
+ // 4 tap filters are never used when width > 4.
+ if (num_taps != 4 && width > 4) {
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ if (is_2d || is_compound) {
+ const __m128i v_sum =
+ HorizontalTaps8To16<filter_index>(&src[x], v_tap);
+ if (is_2d) {
+ StoreAligned16(&dest16[x], v_sum);
+ } else {
+ StoreUnaligned16(&dest16[x], v_sum);
+ }
+ } else {
+ const __m128i result =
+ SimpleHorizontalTaps<filter_index>(&src[x], v_tap);
+ StoreLo8(&dest8[x], result);
+ }
+ x += step;
+ } while (x < width);
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (++y < height);
+ return;
+ }
+
+  // Horizontal passes only need to account for |num_taps| 2 and 4 when
+ // |width| <= 4.
+ assert(width <= 4);
+ assert(num_taps <= 4);
+ if (num_taps <= 4) {
+ if (width == 4) {
+ int y = 0;
+ do {
+ if (is_2d || is_compound) {
+ const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap);
+ StoreLo8(dest16, v_sum);
+ } else {
+ const __m128i result = SimpleHorizontalTaps<filter_index>(src, v_tap);
+ Store4(&dest8[0], result);
+ }
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (++y < height);
+ return;
+ }
+
+ if (!is_compound) {
+ int y = 0;
+ do {
+ if (is_2d) {
+ const __m128i sum =
+ HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+ Store4(&dest16[0], sum);
+ dest16 += pred_stride;
+ Store4(&dest16[0], _mm_srli_si128(sum, 8));
+ dest16 += pred_stride;
+ } else {
+ const __m128i sum =
+ SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ Store2(dest8, sum);
+ dest8 += pred_stride;
+ Store2(dest8, _mm_srli_si128(sum, 4));
+ dest8 += pred_stride;
+ }
+
+ src += src_stride << 1;
+ y += 2;
+ } while (y < height - 1);
+
+ // The 2d filters have an odd |height| because the horizontal pass
+ // generates context for the vertical pass.
+ if (is_2d) {
+ assert(height % 2 == 1);
+ __m128i sum;
+ const __m128i input = LoadLo8(&src[2]);
+ if (filter_index == 3) {
+ // 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_43 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
+ sum = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ } else {
+ // 02 03 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_32 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
+ // 04 05 05 06 06 07 07 08 ...
+ const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
+ const __m128i v_madd_32 =
+ _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 =
+ _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ sum = _mm_add_epi16(v_madd_54, v_madd_32);
+ }
+ sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+ Store4(dest16, sum);
+ }
+ }
+ }
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+ __m128i* v_tap) {
+ if (num_taps == 8) {
+ v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0
+ v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
+ v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
+ v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
+ }
+ } else if (num_taps == 6) {
+ const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+ v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1
+ v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
+ v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ }
+ } else if (num_taps == 4) {
+ v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
+ v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ }
+ } else { // num_taps == 2
+ const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+ v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ }
+ }
+}
+
+template <int num_taps, bool is_compound>
+__m128i SimpleSum2DVerticalTaps(const __m128i* const src,
+ const __m128i* const taps) {
+ __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]);
+ __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]);
+ if (num_taps >= 4) {
+ __m128i madd_lo =
+ _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]);
+ __m128i madd_hi =
+ _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ if (num_taps >= 6) {
+ madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]);
+ madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ if (num_taps == 8) {
+ madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]);
+ madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ }
+ }
+ }
+
+ if (is_compound) {
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const __m128i* const taps) {
+ assert(width >= 8);
+ constexpr int next_row = num_taps - 1;
+ // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+ const ptrdiff_t src_stride = width;
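+  // |intermediate_result| is 16-byte aligned and |width| is a multiple of 8
+  // here, so every row starts on a 16-byte boundary and the aligned loads
+  // below are safe.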
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ int x = 0;
+ do {
+ __m128i srcs[8];
+ const uint16_t* src_x = src + x;
+ srcs[0] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadAligned16(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = LoadAligned16(src_x);
+ src_x += src_stride;
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ StoreUnaligned16(dst16 + x + y * dst_stride, sum);
+ } else {
+ StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(sum, sum));
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (++y < height);
+ x += 8;
+ } while (x < width);
+}
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
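+// With |width| == 4, each aligned 16-byte load below covers two intermediate
+// rows (8 uint16_t values).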
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical4xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const __m128i* const taps) {
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ __m128i srcs[9];
+ srcs[0] = LoadAligned16(src);
+ src += 8;
+ if (num_taps >= 4) {
+ srcs[2] = LoadAligned16(src);
+ src += 8;
+ srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]);
+ if (num_taps >= 6) {
+ srcs[4] = LoadAligned16(src);
+ src += 8;
+ srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]);
+ if (num_taps == 8) {
+ srcs[6] = LoadAligned16(src);
+ src += 8;
+ srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]);
+ }
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[num_taps] = LoadAligned16(src);
+ src += 8;
+ srcs[num_taps - 1] = _mm_unpacklo_epi64(
+ _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]);
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ StoreUnaligned16(dst16, sum);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results = _mm_packus_epi16(sum, sum);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y += 2;
+ } while (y < height);
+}
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVertical2xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const __m128i* const taps) {
+ constexpr int next_row = (num_taps < 6) ? 4 : 8;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ __m128i srcs[9];
+ srcs[0] = LoadAligned16(src);
+ src += 8;
+ if (num_taps >= 6) {
+ srcs[4] = LoadAligned16(src);
+ src += 8;
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ if (num_taps == 8) {
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = LoadAligned16(src);
+ src += 8;
+ if (num_taps == 2) {
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ } else if (num_taps == 4) {
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ } else if (num_taps == 6) {
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+ } else if (num_taps == 8) {
+ srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+ srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8);
+ srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12);
+ }
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+ const __m128i results = _mm_packus_epi16(sum, sum);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
+    // Therefore we don't need to check this condition when |num_taps| > 4.
+ if (num_taps <= 4 && height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ if (num_taps == 6) {
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ } else if (num_taps == 8) {
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ }
+
+ y += 4;
+ } while (y < height);
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+ const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+ const ptrdiff_t dst_stride, const int width, const int height,
+ const int filter_id, const int filter_index) {
+ assert(filter_id != 0);
+ __m128i v_tap[4];
+ const __m128i v_horizontal_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+ if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<8, 8, 2, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 1) { // 6 tap.
+ SetupTaps<6>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<6, 8, 1, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 0) { // 6 tap.
+ SetupTaps<6>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<6, 8, 0, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 8, 4, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else if (filter_index == 5) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 8, 5, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ } else { // 2 tap.
+ SetupTaps<2>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<2, 8, 3, is_2d, is_compound>(
+ src, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+}
+
+void Convolve2D_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+ // The output of the horizontal filter is guaranteed to fit in 16 bits.
+ alignas(16) uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+ const int intermediate_height = height + vertical_taps - 1;
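+  // With at most 8 vertical taps, |intermediate_height| is bounded by
+  // kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1 rows of at most
+  // kMaxSuperBlockSizeInPixels entries, matching the buffer above.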
+
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+
+ DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
+ width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ // Vertical filter.
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ }
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
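+// For 8bpp (kInterRoundBitsHorizontal == 3) this is a rounding shift by 2,
+// which keeps the compound intermediate within 16 bits.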
+__m128i Compound1DShift(const __m128i sum) {
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int filter_index>
+__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
+ __m128i v_src[4];
+
+ if (filter_index < 2) {
+ // 6 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+ v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ } else if (filter_index > 3) {
+ // 4 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ }
+ const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+ return sum;
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int width, const int height,
+ const __m128i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ const int next_row = num_taps - 1;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+ assert(width >= 8);
+
+ int x = 0;
+ do {
+ const uint8_t* src_x = src + x;
+ __m128i srcs[8];
+ srcs[0] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadLo8(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = LoadLo8(src_x);
+ src_x += src_stride;
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16 + x + y * dst_stride, results);
+ } else {
+ const __m128i results =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(results, results));
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (++y < height);
+ x += 8;
+ } while (x < width);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const __m128i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ __m128i srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 4) {
+ srcs[4] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+ int y = 0;
+ do {
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 6) {
+ srcs[6] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+ int y = 0;
+ do {
+ // 50 51 52 53
+ const __m128i c = Load4(src);
+ // 40 41 42 43 50 51 52 53
+ srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+ src += src_stride;
+ // 60 61 62 63
+ srcs[6] = Load4(src);
+ src += src_stride;
+ // 50 51 52 53 60 61 62 63
+ srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 8) {
+ srcs[8] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+ // 50 51 52 53
+ const __m128i c = Load4(src);
+ // 40 41 42 43 50 51 52 53
+ srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+ src += src_stride;
+ // 60 61 62 63
+ srcs[6] = Load4(src);
+ src += src_stride;
+ // 50 51 52 53 60 61 62 63
+ srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+ int y = 0;
+ do {
+ // 70 71 72 73
+ const __m128i d = Load4(src);
+ // 60 61 62 63 70 71 72 73
+ srcs[6] = _mm_unpacklo_epi32(srcs[6], d);
+ src += src_stride;
+ // 80 81 82 83
+ srcs[8] = Load4(src);
+ src += src_stride;
+ // 70 71 72 73 80 81 82 83
+ srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ y += 2;
+ } while (y < height);
+ }
+}
+
+template <int filter_index, bool negative_outside_taps = false>
+void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const __m128i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ __m128i srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = _mm_setzero_si128();
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[2] = Load2<0>(src, srcs[2]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41
+ const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_2, 2);
+ // This uses srcs[0]..srcs[1].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[2];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 4) {
+ srcs[4] = _mm_setzero_si128();
+
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2<0>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+ // This uses srcs[0]..srcs[3].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 6) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = _mm_setzero_si128();
+
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4x, 2);
+
+ int y = 0;
+ do {
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61 70 71
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ // 80 81
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+ const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+ // 50 51 60 61 70 71 80 81
+ srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+
+ // This uses srcs[0]..srcs[5].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 8) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = _mm_setzero_si128();
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2(src);
+ src += src_stride;
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+ int y = 0;
+ do {
+ // 40 41 50 51 60 61 70 71
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ // 80 81
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ // 80 81 90 91
+ srcs[8] = Load2<1>(src, srcs[8]);
+ src += src_stride;
+ // 80 81 90 91 a0 a1
+ srcs[8] = Load2<2>(src, srcs[8]);
+ src += src_stride;
+
+ // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1
+ const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+ // 50 51 60 61 70 71 80 81
+ srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+ // 60 61 70 71 80 81 90 91
+ srcs[6] = _mm_srli_si128(srcs_4_8, 4);
+ // 70 71 80 81 90 91 a0 a1
+ srcs[7] = _mm_srli_si128(srcs_4_8, 6);
+
+ // This uses srcs[0]..srcs[7].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ y += 4;
+ } while (y < height);
+ }
+}
+
+void ConvolveVertical_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int vertical_filter_index,
+ const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+ if (filter_index < 2) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else {
+ // TODO(slavarnway): Investigate adding |filter_index| == 1 special cases.
+ // See convolve_neon.cc
+ SetupTaps<4>(&v_filter, taps);
+
+ if (width == 2) {
+ FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ }
+}
+
+void ConvolveCompoundCopy_SSE4(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/,
+ const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ const ptrdiff_t src_stride = reference_stride;
+ auto* dest = static_cast<uint16_t*>(prediction);
+ constexpr int kRoundBitsVertical =
+ kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
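+  // For 8bpp this is a left shift by 4 (assuming the usual libgav1 values of
+  // 11 for kInterRoundBitsVertical and 7 for kInterRoundBitsCompoundVertical),
+  // moving the pixels into the 16-bit compound prediction range.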
+ if (width >= 16) {
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&src[x]);
+ const __m128i v_src_ext_lo = _mm_cvtepu8_epi16(v_src);
+ const __m128i v_src_ext_hi =
+ _mm_cvtepu8_epi16(_mm_srli_si128(v_src, 8));
+ const __m128i v_dest_lo =
+ _mm_slli_epi16(v_src_ext_lo, kRoundBitsVertical);
+ const __m128i v_dest_hi =
+ _mm_slli_epi16(v_src_ext_hi, kRoundBitsVertical);
+ // TODO(slavarnway): Investigate using aligned stores.
+ StoreUnaligned16(&dest[x], v_dest_lo);
+ StoreUnaligned16(&dest[x + 8], v_dest_hi);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ dest += pred_stride;
+ } while (--y != 0);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ const __m128i v_src = LoadLo8(&src[0]);
+ const __m128i v_src_ext = _mm_cvtepu8_epi16(v_src);
+ const __m128i v_dest = _mm_slli_epi16(v_src_ext, kRoundBitsVertical);
+ StoreUnaligned16(&dest[0], v_dest);
+ src += src_stride;
+ dest += pred_stride;
+ } while (--y != 0);
+ } else { /* width == 4 */
+ int y = height;
+ do {
+ const __m128i v_src0 = Load4(&src[0]);
+ const __m128i v_src1 = Load4(&src[src_stride]);
+ const __m128i v_src = _mm_unpacklo_epi32(v_src0, v_src1);
+ const __m128i v_src_ext = _mm_cvtepu8_epi16(v_src);
+ const __m128i v_dest = _mm_slli_epi16(v_src_ext, kRoundBitsVertical);
+ StoreLo8(&dest[0], v_dest);
+ StoreHi8(&dest[pred_stride], v_dest);
+ src += src_stride * 2;
+ dest += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+void ConvolveCompoundVertical_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int vertical_filter_index,
+ const int /*horizontal_filter_id*/, const int vertical_filter_id,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint16_t*>(prediction);
+ assert(vertical_filter_id != 0);
+
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+ if (filter_index < 2) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps);
+ if (width == 4) {
+ FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps);
+
+ if (width == 4) {
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps);
+
+ if (width == 4) {
+ FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_filter, taps);
+
+ if (width == 4) {
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else {
+ SetupTaps<4>(&v_filter, taps);
+
+ if (width == 4) {
+ FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ }
+}
+
+void ConvolveHorizontal_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int /*vertical_filter_index*/,
+ const int horizontal_filter_id,
+ const int /*vertical_filter_id*/,
+ const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ // Set |src| to the outermost tap.
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+ horizontal_filter_id, filter_index);
+}
+
+void ConvolveCompoundHorizontal_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int horizontal_filter_index, const int /*vertical_filter_index*/,
+ const int horizontal_filter_id, const int /*vertical_filter_id*/,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint16_t*>(prediction);
+
+ DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+ src, reference_stride, dest, width, width, height, horizontal_filter_id,
+ filter_index);
+}
+
+void ConvolveCompound2D_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ alignas(16) uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [3, 5].
+ // Similarly for height.
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int intermediate_height = height + vertical_taps - 1;
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* const src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride -
+ kHorizontalOffset;
+
+ DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+ src, src_stride, intermediate_result, width, width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ // Vertical filter.
+ auto* dest = static_cast<uint16_t*>(prediction);
+ assert(vertical_filter_id != 0);
+
+ const ptrdiff_t dest_stride = width;
+ __m128i taps[4];
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<8, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<6, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<4, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<2, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ }
+}
+
+// Pre-transposed filters.
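+// Each row of the arrays below is one tap of kHalfSubPixelFilters across all
+// 16 fractional positions, so a per-lane byte shuffle by the filter id picks
+// the tap value for every output pixel at once.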
+template <int filter_index>
+inline void GetHalfSubPixelFilter(__m128i* output) {
+ // Filter 0
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel6TapSignedFilterColumns[6][16] =
+ {{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+ {0, -3, -5, -6, -7, -7, -8, -7, -7, -6, -6, -6, -5, -4, -2, -1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {0, -1, -2, -4, -5, -6, -6, -6, -7, -7, -8, -7, -7, -6, -5, -3},
+ {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+ // Filter 1
+ alignas(16) static constexpr int8_t
+ kHalfSubPixel6TapMixedSignedFilterColumns[6][16] = {
+ {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+ {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14},
+ {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+ // Filter 2
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel8TapSignedFilterColumns[8][16] =
+ {{0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, 0},
+ {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+ {0, -3, -6, -9, -11, -11, -12, -12, -12, -11, -10, -9, -7, -5, -3, -1},
+ {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+ {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+ {0, -1, -3, -5, -7, -9, -10, -11, -12, -12, -12, -11, -11, -9, -6, -3},
+ {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+ {0, 0, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1}};
+ // Filter 3
+ alignas(16) static constexpr uint8_t kHalfSubPixel2TapFilterColumns[2][16] = {
+ {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+ {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
+ // Filter 4
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel4TapSignedFilterColumns[4][16] =
+ {{0, -2, -4, -5, -6, -6, -7, -6, -6, -5, -5, -5, -4, -3, -2, -1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {0, -1, -2, -3, -4, -5, -5, -5, -6, -6, -7, -6, -6, -5, -4, -2}};
+ // Filter 5
+ alignas(
+ 16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
+ {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
+ switch (filter_index) {
+ case 0:
+ output[0] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[3]);
+ output[4] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[4]);
+ output[5] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[5]);
+ break;
+ case 1:
+ // The term "mixed" refers to the fact that the outer taps have a mix of
+ // negative and positive values.
+ output[0] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[3]);
+ output[4] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[4]);
+ output[5] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[5]);
+ break;
+ case 2:
+ output[0] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[3]);
+ output[4] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[4]);
+ output[5] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[5]);
+ output[6] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[6]);
+ output[7] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[7]);
+ break;
+ case 3:
+ output[0] = LoadAligned16(kHalfSubPixel2TapFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel2TapFilterColumns[1]);
+ break;
+ case 4:
+ output[0] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[0]);
+ output[1] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[1]);
+ output[2] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[2]);
+ output[3] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[3]);
+ break;
+ default:
+ assert(filter_index == 5);
+ output[0] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[0]);
+ output[1] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[1]);
+ output[2] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[2]);
+ output[3] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[3]);
+ break;
+ }
+}
+
+// There are many opportunities for overreading in scaled convolve, because
+// the range of starting points for filter windows is anywhere from 0 to 16
+// for 8 destination pixels, and the window sizes range from 2 to 8. To
+// accommodate this range concisely, we use |grade_x| to mean the most steps
+// in src that can be traversed in a single |step_x| increment, i.e. 1 or 2.
+// More importantly, |grade_x| answers the question "how many vector loads are
+// needed to cover the source values?"
+// When |grade_x| == 1, the maximum number of source values needed is 8 separate
+// starting positions plus 7 more to cover taps, all fitting into 16 bytes.
+// When |grade_x| > 1, we are guaranteed to exceed 8 whole steps in src for
+// every 8 |step_x| increments, on top of 8 possible taps. The first load covers
+// the starting sources for each kernel, while the final load covers the taps.
+// Since the offset value of src_x cannot exceed 8 and |num_taps| does not
+// exceed 4 when width <= 4, |grade_x| is set to 1 regardless of the value of
+// |step_x|.
+template <int num_taps, int grade_x>
+inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices,
+ __m128i* const source /*[num_taps >> 1]*/) {
+ const __m128i src_vals = LoadUnaligned16(src);
+ source[0] = _mm_shuffle_epi8(src_vals, src_indices);
+ if (grade_x == 1) {
+ if (num_taps > 2) {
+ source[1] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 2), src_indices);
+ }
+ if (num_taps > 4) {
+ source[2] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 4), src_indices);
+ }
+ if (num_taps > 6) {
+ source[3] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 6), src_indices);
+ }
+ } else {
+ assert(grade_x > 1);
+ assert(num_taps != 4);
+ // grade_x > 1 also means width >= 8 && num_taps != 4
+ const __m128i src_vals_ext = LoadLo8(src + 16);
+ if (num_taps > 2) {
+ source[1] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 2),
+ src_indices);
+ source[2] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 4),
+ src_indices);
+ }
+ if (num_taps > 6) {
+ source[3] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 6),
+ src_indices);
+ }
+ }
+}
+
+template <int num_taps>
+inline void PrepareHorizontalTaps(const __m128i subpel_indices,
+ const __m128i* filter_taps,
+ __m128i* out_taps) {
+ const __m128i scale_index_offsets =
+ _mm_srli_epi16(subpel_indices, kFilterIndexShift);
+ const __m128i filter_index_mask = _mm_set1_epi8(kSubPixelMask);
+ const __m128i filter_indices =
+ _mm_and_si128(_mm_packus_epi16(scale_index_offsets, scale_index_offsets),
+ filter_index_mask);
+ // Line up taps for maddubs_epi16.
+ // The unpack is also assumed to be lighter than shift+alignr.
+ for (int k = 0; k < (num_taps >> 1); ++k) {
+ const __m128i taps0 = _mm_shuffle_epi8(filter_taps[2 * k], filter_indices);
+ const __m128i taps1 =
+ _mm_shuffle_epi8(filter_taps[2 * k + 1], filter_indices);
+ out_taps[k] = _mm_unpacklo_epi8(taps0, taps1);
+ }
+}
+
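+// Returns byte-pair shuffle controls {i, i + 1} for the 8 kernel starting
+// positions, so each gathered source pair lines up with the tap pair built by
+// PrepareHorizontalTaps.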
+inline __m128i HorizontalScaleIndices(const __m128i subpel_indices) {
+ const __m128i src_indices16 =
+ _mm_srli_epi16(subpel_indices, kScaleSubPixelBits);
+ const __m128i src_indices = _mm_packus_epi16(src_indices16, src_indices16);
+ return _mm_unpacklo_epi8(src_indices,
+ _mm_add_epi8(src_indices, _mm_set1_epi8(1)));
+}
+
+template <int grade_x, int filter_index, int num_taps>
+inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride,
+ int width, int subpixel_x, int step_x,
+ int intermediate_height,
+ int16_t* intermediate) {
+  // Account for the 0-taps that precede the nonzero taps.
+ const int kernel_offset = (8 - num_taps) >> 1;
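+  // |kernel_offset| is 0, 1, 2, or 3 for 8-, 6-, 4-, and 2-tap filters.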
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ __m128i filter_taps[num_taps];
+ GetHalfSubPixelFilter<filter_index>(filter_taps);
+ const __m128i index_steps =
+ _mm_mullo_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
+ _mm_set1_epi16(static_cast<int16_t>(step_x)));
+
+ __m128i taps[num_taps >> 1];
+ __m128i source[num_taps >> 1];
+ int p = subpixel_x;
+  // Filter indices 3 to 5 are the only ones for which width <= 4 is possible.
+ if (filter_index >= 3) {
+ if (filter_index > 3 || width <= 4) {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const __m128i p_fraction = _mm_set1_epi16(p & 1023);
+ const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
+ PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
+ const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
+
+ int y = intermediate_height;
+ do {
+ // Load and line up source values with the taps. Width 4 means no need
+ // to load extended source.
+ PrepareSourceVectors<num_taps, /*grade_x=*/1>(src_x, packed_indices,
+ source);
+
+ StoreLo8(intermediate, RightShiftWithRounding_S16(
+ SumOnePassTaps<filter_index>(source, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate += kIntermediateStride;
+ } while (--y != 0);
+ return;
+ }
+ }
+
+ // |width| >= 8
+ int x = 0;
+ do {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ int16_t* intermediate_x = intermediate + x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const __m128i p_fraction = _mm_set1_epi16(p & 1023);
+ const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
+ PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
+ const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
+
+ int y = intermediate_height;
+ do {
+ // For each x, a lane of src_k[k] contains src_x[k].
+ PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source);
+
+ // Shift by one less because the taps are halved.
+ StoreAligned16(
+ intermediate_x,
+ RightShiftWithRounding_S16(SumOnePassTaps<filter_index>(source, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (--y != 0);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+template <int num_taps>
+inline void PrepareVerticalTaps(const int8_t* taps, __m128i* output) {
+ // Avoid overreading the filter due to starting at kernel_offset.
+ // The only danger of overread is in the final filter, which has 4 taps.
+ const __m128i filter =
+ _mm_cvtepi8_epi16((num_taps > 4) ? LoadLo8(taps) : Load4(taps));
+ output[0] = _mm_shuffle_epi32(filter, 0);
+ if (num_taps > 2) {
+ output[1] = _mm_shuffle_epi32(filter, 0x55);
+ }
+ if (num_taps > 4) {
+ output[2] = _mm_shuffle_epi32(filter, 0xAA);
+ }
+ if (num_taps > 6) {
+ output[3] = _mm_shuffle_epi32(filter, 0xFF);
+ }
+}
+
+// Process eight 16-bit inputs and output eight 16-bit values.
+template <int num_taps, bool is_compound>
+inline __m128i Sum2DVerticalTaps(const __m128i* const src,
+ const __m128i* taps) {
+ const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]);
+ __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps[0]);
+ const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]);
+ __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps[0]);
+ if (num_taps > 2) {
+ const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps[1]));
+ const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps[1]));
+ }
+ if (num_taps > 4) {
+ const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps[2]));
+ const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps[2]));
+ }
+ if (num_taps > 6) {
+ const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps[3]));
+ const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps[3]));
+ }
+ if (is_compound) {
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+// Bottom half of each src[k] is the source for one filter, and the top half
+// is the source for the other filter, for the next destination row.
+template <int num_taps, bool is_compound>
+__m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo,
+ const __m128i* taps_hi) {
+ const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]);
+ __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps_lo[0]);
+ const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]);
+ __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps_hi[0]);
+ if (num_taps > 2) {
+ const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps_lo[1]));
+ const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps_hi[1]));
+ }
+ if (num_taps > 4) {
+ const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps_lo[2]));
+ const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps_hi[2]));
+ }
+ if (num_taps > 6) {
+ const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]);
+ sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps_lo[3]));
+ const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]);
+ sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps_hi[3]));
+ }
+
+ if (is_compound) {
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+// |width_class| is 2, 4, or 8, according to the Store function that should be
+// used.
+template <int num_taps, int width_class, bool is_compound>
+#if LIBGAV1_MSAN
+__attribute__((no_sanitize_memory)) void ConvolveVerticalScale(
+#else
+inline void ConvolveVerticalScale(
+#endif
+ const int16_t* src, const int width, const int subpixel_y,
+ const int filter_index, const int step_y, const int height, void* dest,
+ const ptrdiff_t dest_stride) {
+ constexpr ptrdiff_t src_stride = kIntermediateStride;
+ constexpr int kernel_offset = (8 - num_taps) / 2;
+ const int16_t* src_y = src;
+ // |dest| is 16-bit in compound mode, Pixel otherwise.
+ auto* dest16_y = static_cast<uint16_t*>(dest);
+ auto* dest_y = static_cast<uint8_t*>(dest);
+ __m128i s[num_taps];
+
+ int p = subpixel_y & 1023;
+ int y = height;
+ if (width_class <= 4) {
+ __m128i filter_taps_lo[num_taps >> 1];
+ __m128i filter_taps_hi[num_taps >> 1];
+ do { // y > 0
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = LoadLo8(src_y + i * src_stride);
+ }
+ int filter_id = (p >> 6) & kSubPixelMask;
+ const int8_t* filter0 =
+ kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+ PrepareVerticalTaps<num_taps>(filter0, filter_taps_lo);
+ p += step_y;
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = LoadHi8(s[i], src_y + i * src_stride);
+ }
+ filter_id = (p >> 6) & kSubPixelMask;
+ const int8_t* filter1 =
+ kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+ PrepareVerticalTaps<num_taps>(filter1, filter_taps_hi);
+ p += step_y;
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+
+ const __m128i sums = Sum2DVerticalTaps4x2<num_taps, is_compound>(
+ s, filter_taps_lo, filter_taps_hi);
+ if (is_compound) {
+ assert(width_class > 2);
+ StoreLo8(dest16_y, sums);
+ dest16_y += dest_stride;
+ StoreHi8(dest16_y, sums);
+ dest16_y += dest_stride;
+ } else {
+ const __m128i result = _mm_packus_epi16(sums, sums);
+ if (width_class == 2) {
+ Store2(dest_y, result);
+ dest_y += dest_stride;
+ Store2(dest_y, _mm_srli_si128(result, 4));
+ } else {
+ Store4(dest_y, result);
+ dest_y += dest_stride;
+ Store4(dest_y, _mm_srli_si128(result, 4));
+ }
+ dest_y += dest_stride;
+ }
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ // |width_class| >= 8
+ __m128i filter_taps[num_taps >> 1];
+ do { // y > 0
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ const int8_t* filter =
+ kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+ PrepareVerticalTaps<num_taps>(filter, filter_taps);
+
+ int x = 0;
+ do { // x < width
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = LoadUnaligned16(src_y + i * src_stride);
+ }
+
+ const __m128i sums =
+ Sum2DVerticalTaps<num_taps, is_compound>(s, filter_taps);
+ if (is_compound) {
+ StoreUnaligned16(dest16_y + x, sums);
+ } else {
+ StoreLo8(dest_y + x, _mm_packus_epi16(sums, sums));
+ }
+ x += 8;
+ src_y += 8;
+ } while (x < width);
+ p += step_y;
+ dest_y += dest_stride;
+ dest16_y += dest_stride;
+ } while (--y != 0);
+}
+
+template <bool is_compound>
+void ConvolveScale2D_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int subpixel_x, const int subpixel_y,
+ const int step_x, const int step_y, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ assert(step_x <= 2048);
+ // The output of the horizontal filter, i.e. the intermediate_result, is
+ // guaranteed to fit in int16_t.
+ // TODO(petersonab): Reduce intermediate block stride to width to make smaller
+ // blocks faster.
+ alignas(16) int16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (2 * kMaxSuperBlockSizeInPixels + kSubPixelTaps)];
+ const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+ const int intermediate_height =
+ (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+ kScaleSubPixelBits) +
+ num_vert_taps;
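+  // i.e. enough rows to cover the source position of the last output row plus
+  // the vertical filter support.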
+
+ // Horizontal filter.
+ // Filter types used for width <= 4 are different from those for width > 4.
+ // When width > 4, the valid filter index range is always [0, 3].
+ // When width <= 4, the valid filter index range is always [3, 5].
+ // Similarly for height.
+ int16_t* intermediate = intermediate_result;
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference);
+ const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+ src += vert_kernel_offset * src_stride;
+
+  // Derive the maximum value of |step_x| at which all source values fit in one
+  // 16-byte load: the final index must satisfy src_x + |num_taps| - 1 < 16.
+  // step_x * 7 is the final base sub-pixel index for the shuffle mask for
+  // filter inputs in each iteration on large blocks. When step_x is large, a
+  // second register and alignr are needed to gather all filter inputs.
+  // |num_taps| - 1 is the offset for the shuffle of inputs to the final tap.
+ const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+ const int kernel_start_ceiling = 16 - num_horiz_taps;
+ // This truncated quotient |grade_x_threshold| selects |step_x| such that:
+ // (step_x * 7) >> kScaleSubPixelBits < single load limit
+ const int grade_x_threshold =
+ (kernel_start_ceiling << kScaleSubPixelBits) / 7;
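+  // For example, with 8 taps (|kernel_start_ceiling| == 8) and the 10-bit
+  // sub-pixel steps used here, the threshold is (8 << 10) / 7 = 1170; any
+  // larger |step_x| needs the grade_x == 2 path and its second load.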
+ switch (horiz_filter_index) {
+ case 0:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 0, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveHorizontalScale<1, 0, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 1:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 1, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+
+ } else {
+ ConvolveHorizontalScale<1, 1, 6>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 2:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 2, 8>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveHorizontalScale<1, 2, 8>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 3:
+ if (step_x > grade_x_threshold) {
+ ConvolveHorizontalScale<2, 3, 2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveHorizontalScale<1, 3, 2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+ break;
+ case 4:
+ assert(width <= 4);
+ ConvolveHorizontalScale<1, 4, 4>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ break;
+ default:
+ assert(horiz_filter_index == 5);
+ assert(width <= 4);
+ ConvolveHorizontalScale<1, 5, 4>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
+
+ // Vertical filter.
+ intermediate = intermediate_result;
+ switch (vert_filter_index) {
+ case 0:
+ case 1:
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<6, 2, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<6, 4, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<6, 8, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ break;
+ case 2:
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<8, 2, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<8, 4, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<8, 8, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ break;
+ case 3:
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<2, 2, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<2, 4, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<2, 8, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ break;
+ default:
+ assert(vert_filter_index == 4 || vert_filter_index == 5);
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale<4, 2, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale<4, 4, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<4, 8, is_compound>(
+ intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ }
+}
+
+inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+ const __m128i left = LoadUnaligned16(src);
+ const __m128i right = LoadUnaligned16(src + 1);
+ StoreUnaligned16(dst, _mm_avg_epu8(left, right));
+}
+
+template <int width>
+inline void IntraBlockCopyHorizontal(const uint8_t* src,
+ const ptrdiff_t src_stride,
+ const int height, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+
+ int y = height;
+ do {
+ HalfAddHorizontal(src, dst);
+ if (width >= 32) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ if (width >= 64) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ if (width == 128) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyHorizontal_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
+ const int height, void* const prediction, const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width == 128) {
+ IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ const __m128i left = LoadLo8(src);
+ const __m128i right = LoadLo8(src + 1);
+ StoreLo8(dest, _mm_avg_epu8(left, right));
+
+ src += reference_stride;
+ dest += pred_stride;
+ } while (--y != 0);
+ } else if (width == 4) {
+ int y = height;
+ do {
+ __m128i left = Load4(src);
+ __m128i right = Load4(src + 1);
+ src += reference_stride;
+ left = _mm_unpacklo_epi32(left, Load4(src));
+ right = _mm_unpacklo_epi32(right, Load4(src + 1));
+ src += reference_stride;
+
+ const __m128i result = _mm_avg_epu8(left, right);
+
+ Store4(dest, result);
+ dest += pred_stride;
+ Store4(dest, _mm_srli_si128(result, 4));
+ dest += pred_stride;
+ y -= 2;
+ } while (y != 0);
+ } else {
+ assert(width == 2);
+ __m128i left = _mm_setzero_si128();
+ __m128i right = _mm_setzero_si128();
+ int y = height;
+ do {
+ left = Load2<0>(src, left);
+ right = Load2<0>(src + 1, right);
+ src += reference_stride;
+ left = Load2<1>(src, left);
+ right = Load2<1>(src + 1, right);
+ src += reference_stride;
+
+ const __m128i result = _mm_avg_epu8(left, right);
+
+ Store2(dest, result);
+ dest += pred_stride;
+ Store2(dest, _mm_srli_si128(result, 2));
+ dest += pred_stride;
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+template <int width>
+inline void IntraBlockCopyVertical(const uint8_t* src,
+ const ptrdiff_t src_stride, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+ __m128i row[8], below[8];
+
+ row[0] = LoadUnaligned16(src);
+ if (width >= 32) {
+ src += 16;
+ row[1] = LoadUnaligned16(src);
+ if (width >= 64) {
+ src += 16;
+ row[2] = LoadUnaligned16(src);
+ src += 16;
+ row[3] = LoadUnaligned16(src);
+ if (width == 128) {
+ src += 16;
+ row[4] = LoadUnaligned16(src);
+ src += 16;
+ row[5] = LoadUnaligned16(src);
+ src += 16;
+ row[6] = LoadUnaligned16(src);
+ src += 16;
+ row[7] = LoadUnaligned16(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = height;
+ do {
+ below[0] = LoadUnaligned16(src);
+ if (width >= 32) {
+ src += 16;
+ below[1] = LoadUnaligned16(src);
+ if (width >= 64) {
+ src += 16;
+ below[2] = LoadUnaligned16(src);
+ src += 16;
+ below[3] = LoadUnaligned16(src);
+ if (width == 128) {
+ src += 16;
+ below[4] = LoadUnaligned16(src);
+ src += 16;
+ below[5] = LoadUnaligned16(src);
+ src += 16;
+ below[6] = LoadUnaligned16(src);
+ src += 16;
+ below[7] = LoadUnaligned16(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ StoreUnaligned16(dst, _mm_avg_epu8(row[0], below[0]));
+ row[0] = below[0];
+ if (width >= 32) {
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[1], below[1]));
+ row[1] = below[1];
+ if (width >= 64) {
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[2], below[2]));
+ row[2] = below[2];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[3], below[3]));
+ row[3] = below[3];
+        if (width == 128) {
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[4], below[4]));
+ row[4] = below[4];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[5], below[5]));
+ row[5] = below[5];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[6], below[6]));
+ row[6] = below[6];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[7], below[7]));
+ row[7] = below[7];
+ }
+ }
+ }
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyVertical_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+ const int width, const int height, void* const prediction,
+ const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width == 128) {
+ IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 8) {
+ __m128i row, below;
+ row = LoadLo8(src);
+ src += reference_stride;
+
+ int y = height;
+ do {
+ below = LoadLo8(src);
+ src += reference_stride;
+
+ StoreLo8(dest, _mm_avg_epu8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (--y != 0);
+ } else if (width == 4) {
+ __m128i row = Load4(src);
+ src += reference_stride;
+
+ int y = height;
+ do {
+ __m128i below = Load4(src);
+ src += reference_stride;
+
+ Store4(dest, _mm_avg_epu8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (--y != 0);
+ } else {
+ assert(width == 2);
+ __m128i row = Load2(src);
+ __m128i below = _mm_setzero_si128();
+ src += reference_stride;
+
+ int y = height;
+ do {
+ below = Load2<0>(src, below);
+ src += reference_stride;
+
+ Store2(dest, _mm_avg_epu8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (--y != 0);
+ }
+}
+
+// Load then add two uint8_t vectors. Return the uint16_t vector result.
+inline __m128i LoadU8AndAddLong(const uint8_t* src, const uint8_t* src1) {
+ const __m128i a = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i b = _mm_cvtepu8_epi16(LoadLo8(src1));
+ return _mm_add_epi16(a, b);
+}
+
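+// For the pixel sums used here this computes (v0 + v1 + 2) >> 2, i.e. the
+// rounded average of the four underlying pixels, packed back to 8 bits.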
+inline __m128i AddU16RightShift2AndPack(__m128i v0, __m128i v1) {
+ const __m128i a = _mm_add_epi16(v0, v1);
+ const __m128i b = _mm_srli_epi16(a, 1);
+ // Use avg here to shift right by 1 with round.
+ const __m128i c = _mm_avg_epu16(b, _mm_setzero_si128());
+ return _mm_packus_epi16(c, c);
+}
+
+template <int width>
+inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
+ const int height, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
+ __m128i row[16];
+ row[0] = LoadU8AndAddLong(src, src + 1);
+ if (width >= 16) {
+ src += 8;
+ row[1] = LoadU8AndAddLong(src, src + 1);
+ if (width >= 32) {
+ src += 8;
+ row[2] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[3] = LoadU8AndAddLong(src, src + 1);
+ if (width >= 64) {
+ src += 8;
+ row[4] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[5] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[6] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[7] = LoadU8AndAddLong(src, src + 1);
+ if (width == 128) {
+ src += 8;
+ row[8] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[9] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[10] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[11] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[12] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[13] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[14] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[15] = LoadU8AndAddLong(src, src + 1);
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = height;
+ do {
+ const __m128i below_0 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[0], below_0));
+ row[0] = below_0;
+ if (width >= 16) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_1 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[1], below_1));
+ row[1] = below_1;
+ if (width >= 32) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_2 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[2], below_2));
+ row[2] = below_2;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_3 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[3], below_3));
+ row[3] = below_3;
+ if (width >= 64) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_4 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[4], below_4));
+ row[4] = below_4;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_5 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[5], below_5));
+ row[5] = below_5;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_6 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[6], below_6));
+ row[6] = below_6;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_7 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[7], below_7));
+ row[7] = below_7;
+ if (width == 128) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_8 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[8], below_8));
+ row[8] = below_8;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_9 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[9], below_9));
+ row[9] = below_9;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_10 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[10], below_10));
+ row[10] = below_10;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_11 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[11], below_11));
+ row[11] = below_11;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_12 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[12], below_12));
+ row[12] = below_12;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_13 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[13], below_13));
+ row[13] = below_13;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_14 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[14], below_14));
+ row[14] = below_14;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_15 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[15], below_15));
+ row[15] = below_15;
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopy2D_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+ const int width, const int height, void* const prediction,
+ const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+  // Note: this function may read up to height + 1 rows vertically. Because it
+  // is only used for the u/v planes of intra block copy, such access is
+  // guaranteed to be within the prediction block.
+
+ if (width == 128) {
+ IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 8) {
+ IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 4) {
+ __m128i left = _mm_cvtepu8_epi16(Load4(src));
+ __m128i right = _mm_cvtepu8_epi16(Load4(src + 1));
+ src += reference_stride;
+
+ __m128i row = _mm_add_epi16(left, right);
+
+ int y = height;
+ do {
+ left = Load4(src);
+ right = Load4(src + 1);
+ src += reference_stride;
+ left = _mm_unpacklo_epi32(left, Load4(src));
+ right = _mm_unpacklo_epi32(right, Load4(src + 1));
+ src += reference_stride;
+
+ const __m128i below =
+ _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+ const __m128i result =
+ AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below);
+
+ Store4(dest, result);
+ dest += pred_stride;
+ Store4(dest, _mm_srli_si128(result, 4));
+ dest += pred_stride;
+
+ row = _mm_srli_si128(below, 8);
+ y -= 2;
+ } while (y != 0);
+ } else {
+ __m128i left = Load2(src);
+ __m128i right = Load2(src + 1);
+ src += reference_stride;
+
+ __m128i row =
+ _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+
+ int y = height;
+ do {
+ left = Load2<0>(src, left);
+ right = Load2<0>(src + 1, right);
+ src += reference_stride;
+ left = Load2<2>(src, left);
+ right = Load2<2>(src + 1, right);
+ src += reference_stride;
+
+ const __m128i below =
+ _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+ const __m128i result =
+ AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below);
+
+ Store2(dest, result);
+ dest += pred_stride;
+ Store2(dest, _mm_srli_si128(result, 4));
+ dest += pred_stride;
+
+ row = _mm_srli_si128(below, 8);
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_SSE4_1;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1;
+ dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1;
+
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4;
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1;
+
+ dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_SSE4_1;
+ dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_SSE4_1;
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_SSE4_1;
+
+ dsp->convolve_scale[0] = ConvolveScale2D_SSE4_1<false>;
+ dsp->convolve_scale[1] = ConvolveScale2D_SSE4_1<true>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void ConvolveInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/convolve_sse4.h b/src/dsp/x86/convolve_sse4.h
new file mode 100644
index 0000000..d6c3155
--- /dev/null
+++ b/src/dsp/x86/convolve_sse4.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve and Dsp::convolve_scale. See the defines below for
+// specifics. This function is not thread-safe.
+void ConvolveInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundCopy
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompound2D
+#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveScale2D
+#define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
diff --git a/src/dsp/x86/distance_weighted_blend_sse4.cc b/src/dsp/x86/distance_weighted_blend_sse4.cc
new file mode 100644
index 0000000..deb57ef
--- /dev/null
+++ b/src/dsp/x86/distance_weighted_blend_sse4.cc
@@ -0,0 +1,230 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kInterPostRoundBit = 4;
+
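+// Computes (pred0 * weight_0 + pred1 * weight_1) >> (kInterPostRoundBit + 4)
+// per lane; the extra 4 bits undo the weight scaling, since the two distance
+// weights sum to 16.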
+inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
+ const __m128i& pred1,
+ const __m128i& weights) {
+ // TODO(https://issuetracker.google.com/issues/150325685): Investigate range.
+ const __m128i preds_lo = _mm_unpacklo_epi16(pred0, pred1);
+ const __m128i mult_lo = _mm_madd_epi16(preds_lo, weights);
+ const __m128i result_lo =
+ RightShiftWithRounding_S32(mult_lo, kInterPostRoundBit + 4);
+
+ const __m128i preds_hi = _mm_unpackhi_epi16(pred0, pred1);
+ const __m128i mult_hi = _mm_madd_epi16(preds_hi, weights);
+ const __m128i result_hi =
+ RightShiftWithRounding_S32(mult_hi, kInterPostRoundBit + 4);
+
+ return _mm_packs_epi32(result_lo, result_hi);
+}
+
+template <int height>
+inline void DistanceWeightedBlend4xH_SSE4_1(
+ const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+
+ for (int y = 0; y < height; y += 4) {
+ // TODO(b/150326556): Use larger loads.
+ const __m128i src_00 = LoadLo8(pred_0);
+ const __m128i src_10 = LoadLo8(pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ __m128i src_0 = LoadHi8(src_00, pred_0);
+ __m128i src_1 = LoadHi8(src_10, pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ const __m128i res0 = ComputeWeightedAverage8(src_0, src_1, weights);
+
+ const __m128i src_01 = LoadLo8(pred_0);
+ const __m128i src_11 = LoadLo8(pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ src_0 = LoadHi8(src_01, pred_0);
+ src_1 = LoadHi8(src_11, pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ const __m128i res1 = ComputeWeightedAverage8(src_0, src_1, weights);
+
+ const __m128i result_pixels = _mm_packus_epi16(res0, res1);
+ Store4(dst, result_pixels);
+ dst += dest_stride;
+ const int result_1 = _mm_extract_epi32(result_pixels, 1);
+ memcpy(dst, &result_1, sizeof(result_1));
+ dst += dest_stride;
+ const int result_2 = _mm_extract_epi32(result_pixels, 2);
+ memcpy(dst, &result_2, sizeof(result_2));
+ dst += dest_stride;
+ const int result_3 = _mm_extract_epi32(result_pixels, 3);
+ memcpy(dst, &result_3, sizeof(result_3));
+ dst += dest_stride;
+ }
+}
+
+template <int height>
+inline void DistanceWeightedBlend8xH_SSE4_1(
+ const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+
+ for (int y = 0; y < height; y += 2) {
+ const __m128i src_00 = LoadAligned16(pred_0);
+ const __m128i src_10 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res0 = ComputeWeightedAverage8(src_00, src_10, weights);
+
+ const __m128i src_01 = LoadAligned16(pred_0);
+ const __m128i src_11 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res1 = ComputeWeightedAverage8(src_01, src_11, weights);
+
+ const __m128i result_pixels = _mm_packus_epi16(res0, res1);
+ StoreLo8(dst, result_pixels);
+ dst += dest_stride;
+ StoreHi8(dst, result_pixels);
+ dst += dest_stride;
+ }
+}
+
+inline void DistanceWeightedBlendLarge_SSE4_1(
+ const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, const int width, const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const __m128i src_0_lo = LoadAligned16(pred_0 + x);
+ const __m128i src_1_lo = LoadAligned16(pred_1 + x);
+ const __m128i res_lo =
+ ComputeWeightedAverage8(src_0_lo, src_1_lo, weights);
+
+ const __m128i src_0_hi = LoadAligned16(pred_0 + x + 8);
+ const __m128i src_1_hi = LoadAligned16(pred_1 + x + 8);
+ const __m128i res_hi =
+ ComputeWeightedAverage8(src_0_hi, src_1_hi, weights);
+
+ StoreUnaligned16(dst + x, _mm_packus_epi16(res_lo, res_hi));
+ x += 16;
+ } while (x < width);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+}
+
+void DistanceWeightedBlend_SSE4_1(const void* prediction_0,
+ const void* prediction_1,
+ const uint8_t weight_0,
+ const uint8_t weight_1, const int width,
+ const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ if (width == 4) {
+ if (height == 4) {
+ DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ } else if (height == 8) {
+ DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ } else {
+ assert(height == 16);
+ DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ }
+ return;
+ }
+
+ if (width == 8) {
+ switch (height) {
+ case 4:
+ DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ return;
+ case 8:
+ DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ return;
+ case 16:
+ DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ return;
+ default:
+ assert(height == 32);
+ DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1,
+ dest, dest_stride);
+ return;
+ }
+ }
+
+ DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width,
+ height, dest, dest_stride);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(DistanceWeightedBlend)
+ dsp->distance_weighted_blend = DistanceWeightedBlend_SSE4_1;
+#endif
+}
+
+} // namespace
+
+void DistanceWeightedBlendInit_SSE4_1() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void DistanceWeightedBlendInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/distance_weighted_blend_sse4.h b/src/dsp/x86/distance_weighted_blend_sse4.h
new file mode 100644
index 0000000..8646eca
--- /dev/null
+++ b/src/dsp/x86/distance_weighted_blend_sse4.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::distance_weighted_blend. This function is not thread-safe.
+void DistanceWeightedBlendInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_DistanceWeightedBlend
+#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_
diff --git a/src/dsp/x86/intra_edge_sse4.cc b/src/dsp/x86/intra_edge_sse4.cc
new file mode 100644
index 0000000..4a8658d
--- /dev/null
+++ b/src/dsp/x86/intra_edge_sse4.cc
@@ -0,0 +1,270 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring> // memcpy
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kKernelTaps = 5;
+constexpr int kKernels[3][kKernelTaps] = {
+ {0, 4, 8, 4, 0}, {0, 5, 6, 5, 0}, {2, 4, 4, 4, 2}};
+constexpr int kMaxEdgeBufferSize = 129;
+
+// This function applies the kernel [0, 4, 8, 4, 0] to 12 values.
+// Assumes |source| has 16 packed byte values. Produces 12 valid filter
+// outputs, written as a single 16-byte store whose final 4 bytes are
+// overwritten by the next iteration or by the scalar cleanup loop.
+inline void ComputeKernel1Store12(uint8_t* dest, const uint8_t* source) {
+ const __m128i edge_lo = LoadUnaligned16(source);
+ const __m128i edge_hi = _mm_srli_si128(edge_lo, 6);
+ // Samples matched with the '4' tap, expanded to 16-bit.
+ const __m128i outers_lo = _mm_cvtepu8_epi16(edge_lo);
+ const __m128i outers_hi = _mm_cvtepu8_epi16(edge_hi);
+ // Samples matched with the '8' tap, expanded to 16-bit.
+ const __m128i centers_lo = _mm_srli_si128(outers_lo, 2);
+ const __m128i centers_hi = _mm_srli_si128(outers_hi, 2);
+
+ // Apply the taps by shifting.
+ const __m128i outers4_lo = _mm_slli_epi16(outers_lo, 2);
+ const __m128i outers4_hi = _mm_slli_epi16(outers_hi, 2);
+ const __m128i centers8_lo = _mm_slli_epi16(centers_lo, 3);
+ const __m128i centers8_hi = _mm_slli_epi16(centers_hi, 3);
+ // Move latter 4x values down to add with first 4x values for each output.
+ const __m128i partial_sums_lo =
+ _mm_add_epi16(outers4_lo, _mm_srli_si128(outers4_lo, 4));
+ const __m128i partial_sums_hi =
+ _mm_add_epi16(outers4_hi, _mm_srli_si128(outers4_hi, 4));
+  // Add the 8x center values for the final kernel sum of each output.
+ const __m128i sums_lo = RightShiftWithRounding_U16(
+ _mm_add_epi16(partial_sums_lo, centers8_lo), 4);
+ const __m128i sums_hi = RightShiftWithRounding_U16(
+ _mm_add_epi16(partial_sums_hi, centers8_hi), 4);
+
+ const __m128i result_lo = _mm_packus_epi16(sums_lo, sums_lo);
+ const __m128i result_hi = _mm_packus_epi16(sums_hi, sums_hi);
+ const __m128i result =
+ _mm_alignr_epi8(result_hi, _mm_slli_si128(result_lo, 10), 10);
+ StoreUnaligned16(dest, result);
+}
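+
+// Reference-only scalar sketch of one kernel-1 output, matching the scalar
+// cleanup loop at the end of IntraEdgeFilter_SSE4_1 (minus boundary
+// clamping); the name is illustrative. The active taps {4, 8, 4} sum to 16,
+// hence the shift by 4. |edge| points at the sample to the left of the
+// output position.
+inline uint8_t Kernel1Pixel(const uint8_t* const edge) {
+  return static_cast<uint8_t>(
+      RightShiftWithRounding(4 * edge[0] + 8 * edge[1] + 4 * edge[2], 4));
+}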
+
+// This function applies the kernel [0, 5, 6, 5, 0] to 12 values.
+// Assumes |source| has 16 packed byte values. Produces 12 valid filter
+// outputs, written as a single 16-byte store whose final 4 bytes are
+// overwritten by the next iteration or by the scalar cleanup loop.
+inline void ComputeKernel2Store12(uint8_t* dest, const uint8_t* source) {
+ const __m128i edge_lo = LoadUnaligned16(source);
+ const __m128i edge_hi = _mm_srli_si128(edge_lo, 6);
+ const __m128i outers_lo = _mm_cvtepu8_epi16(edge_lo);
+ const __m128i centers_lo = _mm_srli_si128(outers_lo, 2);
+ const __m128i outers_hi = _mm_cvtepu8_epi16(edge_hi);
+ const __m128i centers_hi = _mm_srli_si128(outers_hi, 2);
+ // Samples matched with the '5' tap, expanded to 16-bit. Add x + 4x.
+ const __m128i outers5_lo =
+ _mm_add_epi16(outers_lo, _mm_slli_epi16(outers_lo, 2));
+ const __m128i outers5_hi =
+ _mm_add_epi16(outers_hi, _mm_slli_epi16(outers_hi, 2));
+ // Samples matched with the '6' tap, expanded to 16-bit. Add 2x + 4x.
+ const __m128i centers6_lo = _mm_add_epi16(_mm_slli_epi16(centers_lo, 1),
+ _mm_slli_epi16(centers_lo, 2));
+ const __m128i centers6_hi = _mm_add_epi16(_mm_slli_epi16(centers_hi, 1),
+ _mm_slli_epi16(centers_hi, 2));
+ // Move latter 5x values down to add with first 5x values for each output.
+ const __m128i partial_sums_lo =
+ _mm_add_epi16(outers5_lo, _mm_srli_si128(outers5_lo, 4));
+  // Add the 6x center values for the final kernel sum of each output.
+ const __m128i sums_lo = RightShiftWithRounding_U16(
+ _mm_add_epi16(centers6_lo, partial_sums_lo), 4);
+ // Shift latter 5x values to add with first 5x values for each output.
+ const __m128i partial_sums_hi =
+ _mm_add_epi16(outers5_hi, _mm_srli_si128(outers5_hi, 4));
+  // Add the 6x center values for the final kernel sum of each output.
+ const __m128i sums_hi = RightShiftWithRounding_U16(
+ _mm_add_epi16(centers6_hi, partial_sums_hi), 4);
+ // First 6 values are valid outputs.
+ const __m128i result_lo = _mm_packus_epi16(sums_lo, sums_lo);
+ const __m128i result_hi = _mm_packus_epi16(sums_hi, sums_hi);
+ const __m128i result =
+ _mm_alignr_epi8(result_hi, _mm_slli_si128(result_lo, 10), 10);
+ StoreUnaligned16(dest, result);
+}
+
+// This function applies the kernel [2, 4, 4, 4, 2] to 8 values.
+inline void ComputeKernel3Store8(uint8_t* dest, const uint8_t* source) {
+ const __m128i edge_lo = LoadUnaligned16(source);
+ const __m128i edge_hi = _mm_srli_si128(edge_lo, 4);
+ // Finish |edge_lo| life cycle quickly.
+ // Multiply for 2x.
+ const __m128i source2_lo = _mm_slli_epi16(_mm_cvtepu8_epi16(edge_lo), 1);
+ // Multiply 2x by 2 and align.
+ const __m128i source4_lo = _mm_srli_si128(_mm_slli_epi16(source2_lo, 1), 2);
+ // Finish |source2| life cycle quickly.
+ // Move latter 2x values down to add with first 2x values for each output.
+ __m128i sum = _mm_add_epi16(source2_lo, _mm_srli_si128(source2_lo, 8));
+ // First 4x values already aligned to add with running total.
+ sum = _mm_add_epi16(sum, source4_lo);
+ // Move second 4x values down to add with running total.
+ sum = _mm_add_epi16(sum, _mm_srli_si128(source4_lo, 2));
+ // Move third 4x values down to add with running total.
+ sum = _mm_add_epi16(sum, _mm_srli_si128(source4_lo, 4));
+ // Multiply for 2x.
+ const __m128i source2_hi = _mm_slli_epi16(_mm_cvtepu8_epi16(edge_hi), 1);
+ // Multiply 2x by 2 and align.
+ const __m128i source4_hi = _mm_srli_si128(_mm_slli_epi16(source2_hi, 1), 2);
+ // Move latter 2x values down to add with first 2x values for each output.
+ __m128i sum_hi = _mm_add_epi16(source2_hi, _mm_srli_si128(source2_hi, 8));
+ // First 4x values already aligned to add with running total.
+ sum_hi = _mm_add_epi16(sum_hi, source4_hi);
+ // Move second 4x values down to add with running total.
+ sum_hi = _mm_add_epi16(sum_hi, _mm_srli_si128(source4_hi, 2));
+ // Move third 4x values down to add with running total.
+ sum_hi = _mm_add_epi16(sum_hi, _mm_srli_si128(source4_hi, 4));
+
+ // Because we have only 8 values here, it is safe to align before packing down
+ // to 8-bit without losing data.
+ sum = _mm_alignr_epi8(sum_hi, _mm_slli_si128(sum, 8), 8);
+ sum = RightShiftWithRounding_U16(sum, 4);
+ StoreLo8(dest, _mm_packus_epi16(sum, sum));
+}
+
+void IntraEdgeFilter_SSE4_1(void* buffer, int size, int strength) {
+ uint8_t edge[kMaxEdgeBufferSize + 4];
+ memcpy(edge, buffer, size);
+ auto* dst_buffer = static_cast<uint8_t*>(buffer);
+
+  // Only |size| - 1 elements are filtered (element 0 is left unchanged), so
+  // there is nothing to do when |size| == 1.
+ if (size == 1) return;
+
+ int i = 0;
+ switch (strength) {
+ case 1:
+      // To avoid overwriting, the loop stops short of |size| by the total
+      // write size (16 bytes) plus the initial offset. Each iteration
+      // produces 12 valid values as part of a 16-byte store.
+ for (; i < size - 17; i += 12) {
+ ComputeKernel1Store12(dst_buffer + i + 1, edge + i);
+ }
+ break;
+ case 2:
+ // See the comment for case 1.
+ for (; i < size - 17; i += 12) {
+ ComputeKernel2Store12(dst_buffer + i + 1, edge + i);
+ }
+ break;
+ default:
+ assert(strength == 3);
+ // The first filter input is repeated for taps of value 2 and 4.
+ dst_buffer[1] = RightShiftWithRounding(
+ (6 * edge[0] + 4 * edge[1] + 4 * edge[2] + 2 * edge[3]), 4);
+ // In this case, one block of 8 bytes is written in each iteration, with
+ // an offset of 2.
+ for (; i < size - 10; i += 8) {
+ ComputeKernel3Store8(dst_buffer + i + 2, edge + i);
+ }
+ }
+ const int kernel_index = strength - 1;
+ for (int final_index = Clip3(i, 1, size - 2); final_index < size;
+ ++final_index) {
+ int sum = 0;
+ for (int j = 0; j < kKernelTaps; ++j) {
+ const int k = Clip3(final_index + j - 2, 0, size - 1);
+ sum += kKernels[kernel_index][j] * edge[k];
+ }
+ dst_buffer[final_index] = RightShiftWithRounding(sum, 4);
+ }
+}
+
+constexpr int kMaxUpsampleSize = 16;
+
+// Applies the upsampling kernel [-1, 9, 9, -1] to produce the interpolated
+// (odd) output pixels and interleaves them with the original values. This
+// implementation assumes it is safe to write the maximum number of upsampled
+// pixels (32) to the edge buffer, even when |size| is small.
+void IntraEdgeUpsampler_SSE4_1(void* buffer, int size) {
+ assert(size % 4 == 0 && size <= kMaxUpsampleSize);
+ auto* const pixel_buffer = static_cast<uint8_t*>(buffer);
+ uint8_t temp[kMaxUpsampleSize + 8];
+ temp[0] = temp[1] = pixel_buffer[-1];
+ memcpy(temp + 2, pixel_buffer, sizeof(temp[0]) * size);
+ temp[size + 2] = pixel_buffer[size - 1];
+
+ pixel_buffer[-2] = temp[0];
+ const __m128i data = LoadUnaligned16(temp);
+ const __m128i src_lo = _mm_cvtepu8_epi16(data);
+ const __m128i src_hi = _mm_unpackhi_epi8(data, _mm_setzero_si128());
+ const __m128i src9_hi = _mm_add_epi16(src_hi, _mm_slli_epi16(src_hi, 3));
+ const __m128i src9_lo = _mm_add_epi16(src_lo, _mm_slli_epi16(src_lo, 3));
+ __m128i sum_lo = _mm_sub_epi16(_mm_alignr_epi8(src9_hi, src9_lo, 2), src_lo);
+ sum_lo = _mm_add_epi16(sum_lo, _mm_alignr_epi8(src9_hi, src9_lo, 4));
+ sum_lo = _mm_sub_epi16(sum_lo, _mm_alignr_epi8(src_hi, src_lo, 6));
+ sum_lo = RightShiftWithRounding_S16(sum_lo, 4);
+ const __m128i result_lo = _mm_unpacklo_epi8(_mm_packus_epi16(sum_lo, sum_lo),
+ _mm_srli_si128(data, 2));
+ StoreUnaligned16(pixel_buffer - 1, result_lo);
+ if (size > 8) {
+ const __m128i src_hi_extra = _mm_cvtepu8_epi16(LoadLo8(temp + 16));
+ const __m128i src9_hi_extra =
+ _mm_add_epi16(src_hi_extra, _mm_slli_epi16(src_hi_extra, 3));
+ __m128i sum_hi =
+ _mm_sub_epi16(_mm_alignr_epi8(src9_hi_extra, src9_hi, 2), src_hi);
+ sum_hi = _mm_add_epi16(sum_hi, _mm_alignr_epi8(src9_hi_extra, src9_hi, 4));
+ sum_hi = _mm_sub_epi16(sum_hi, _mm_alignr_epi8(src_hi_extra, src_hi, 6));
+ sum_hi = RightShiftWithRounding_S16(sum_hi, 4);
+ const __m128i result_hi =
+ _mm_unpacklo_epi8(_mm_packus_epi16(sum_hi, sum_hi), LoadLo8(temp + 10));
+ StoreUnaligned16(pixel_buffer + 15, result_hi);
+ }
+}
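+
+// Reference-only scalar sketch of one interpolated pixel produced above; the
+// name is illustrative. The taps sum to 16 and the clamp mirrors the
+// _mm_packus_epi16 saturation.
+inline uint8_t UpsamplePixel(const uint8_t p0, const uint8_t p1,
+                             const uint8_t p2, const uint8_t p3) {
+  const int sum = -p0 + 9 * p1 + 9 * p2 - p3;
+  return static_cast<uint8_t>(Clip3((sum + 8) >> 4, 0, 255));
+}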
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(IntraEdgeFilter)
+ dsp->intra_edge_filter = IntraEdgeFilter_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(IntraEdgeUpsampler)
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_SSE4_1;
+#endif
+}
+
+} // namespace
+
+void IntraEdgeInit_SSE4_1() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraEdgeInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intra_edge_sse4.h b/src/dsp/x86/intra_edge_sse4.h
new file mode 100644
index 0000000..6ed4d40
--- /dev/null
+++ b/src/dsp/x86/intra_edge_sse4.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler. This
+// function is not thread-safe.
+void IntraEdgeInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeFilter
+#define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeUpsampler
+#define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_
diff --git a/src/dsp/x86/intrapred_cfl_sse4.cc b/src/dsp/x86/intrapred_cfl_sse4.cc
new file mode 100644
index 0000000..fac1556
--- /dev/null
+++ b/src/dsp/x86/intrapred_cfl_sse4.cc
@@ -0,0 +1,976 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_SSE4_1
+
+inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
+ __m128i alpha_sign, __m128i dc_q0) {
+ __m128i ac_q3 = LoadUnaligned16(input);
+ __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+ __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
+ scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
+ return _mm_add_epi16(scaled_luma_q0, dc_q0);
+}
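+
+// Reference-only scalar sketch of CflPredictUnclipped for one sample; the
+// name is illustrative. |ac_q3| is the zero-mean subsampled luma in Q3 and
+// |alpha| is the signed CfL scale. The _mm_sign/_mm_mulhrs pair above amounts
+// to a divide by 64 with round-half-away-from-zero.
+inline int CflPredictPixelUnclipped(const int ac_q3, const int alpha,
+                                    const int dc) {
+  const int product = alpha * ac_q3;
+  const int magnitude = (((product < 0) ? -product : product) + 32) >> 6;
+  return dc + ((product < 0) ? -magnitude : magnitude);
+}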
+
+template <int width, int height>
+void CflIntraPredictor_SSE4_1(
+ void* const dest, ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i alpha_sign = _mm_set1_epi16(alpha);
+ const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+ auto* row = reinterpret_cast<const __m128i*>(luma);
+ const int kCflLumaBufferStrideLog2_16i = 5;
+ const int kCflLumaBufferStrideLog2_128i = kCflLumaBufferStrideLog2_16i - 3;
+ const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i);
+ const __m128i dc_val = _mm_set1_epi16(dst[0]);
+ do {
+ __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val);
+ if (width < 16) {
+ res = _mm_packus_epi16(res, res);
+ if (width == 4) {
+ Store4(dst, res);
+ } else {
+ StoreLo8(dst, res);
+ }
+ } else {
+ __m128i next =
+ CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
+ res = _mm_packus_epi16(res, next);
+ StoreUnaligned16(dst, res);
+ if (width == 32) {
+ res = CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val);
+ next = CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val);
+ res = _mm_packus_epi16(res, next);
+ StoreUnaligned16(dst + 16, res);
+ }
+ }
+ dst += stride;
+ } while ((row += (1 << kCflLumaBufferStrideLog2_128i)) < row_end);
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 4, "");
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const auto* src = static_cast<const uint8_t*>(source);
+ __m128i sum = _mm_setzero_si128();
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+ __m128i samples;
+ int y = 0;
+ do {
+ samples = Load4(src);
+ src += stride;
+ int src_bytes;
+ memcpy(&src_bytes, src, 4);
+ samples = _mm_insert_epi32(samples, src_bytes, 1);
+ src += stride;
+ samples = _mm_slli_epi16(_mm_cvtepu8_epi16(samples), 3);
+ StoreLo8(luma_ptr, samples);
+ luma_ptr += kCflLumaBufferStride;
+ StoreHi8(luma_ptr, samples);
+ luma_ptr += kCflLumaBufferStride;
+
+ // The maximum value here is 2**bd * H * 2**shift. Since the maximum H for
+ // 4XH is 16 = 2**4, we have 2**(8 + 4 + 3) = 2**15, which fits in 16 bits.
+ sum = _mm_add_epi16(sum, samples);
+ y += 2;
+ } while (y < visible_height);
+
+ if (!is_inside) {
+ int y = visible_height;
+ do {
+ StoreHi8(luma_ptr, samples);
+ luma_ptr += kCflLumaBufferStride;
+ sum = _mm_add_epi16(sum, samples);
+ ++y;
+ } while (y < block_height);
+ }
+
+ __m128i sum_tmp = _mm_unpackhi_epi16(sum, zero);
+ sum = _mm_cvtepu16_epi32(sum);
+ sum = _mm_add_epi32(sum, sum_tmp);
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+ __m128i averages = RightShiftWithRounding_U32(
+ sum, block_height_log2 + 2 /* log2 of width 4 */);
+ averages = _mm_shufflelo_epi16(averages, 0);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples = LoadLo8(luma_ptr);
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 4, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 4;
+
+ if (block_height <= max_luma_height && block_width <= max_luma_width) {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+template <int block_height_log2, bool inside>
+void CflSubsampler444_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 5, "");
+ const int block_height = 1 << block_height_log2, block_width = 8;
+ const int visible_height = max_luma_height;
+ const int invisible_width = inside ? 0 : block_width - max_luma_width;
+ const int visible_width = max_luma_width;
+ const __m128i blend_mask =
+ inside ? _mm_setzero_si128() : MaskHighNBytes(8 + invisible_width);
+ const __m128i dup16 = _mm_set1_epi32(0x01000100);
+ const auto* src = static_cast<const uint8_t*>(source);
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+  // Since the maximum height is 32, splitting the rows by parity means each
+  // accumulator sums at most 16 rows. As in the 4xH case, the sums fit in 16
+  // bits without widening to 32 bits.
+ __m128i sum_even = _mm_setzero_si128(), sum_odd = _mm_setzero_si128();
+ __m128i sum;
+ __m128i samples1;
+
+ int y = 0;
+ do {
+ __m128i samples0 = LoadLo8(src);
+ if (!inside) {
+ const __m128i border0 =
+ _mm_set1_epi8(static_cast<int8_t>(src[visible_width - 1]));
+ samples0 = _mm_blendv_epi8(samples0, border0, blend_mask);
+ }
+ src += stride;
+ samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples0), 3);
+ StoreUnaligned16(luma_ptr, samples0);
+ luma_ptr += kCflLumaBufferStride;
+
+ sum_even = _mm_add_epi16(sum_even, samples0);
+
+ samples1 = LoadLo8(src);
+ if (!inside) {
+ const __m128i border1 =
+ _mm_set1_epi8(static_cast<int8_t>(src[visible_width - 1]));
+ samples1 = _mm_blendv_epi8(samples1, border1, blend_mask);
+ }
+ src += stride;
+ samples1 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples1), 3);
+ StoreUnaligned16(luma_ptr, samples1);
+ luma_ptr += kCflLumaBufferStride;
+
+ sum_odd = _mm_add_epi16(sum_odd, samples1);
+ y += 2;
+ } while (y < visible_height);
+
+ if (!inside) {
+ for (int y = visible_height; y < block_height; y += 2) {
+ sum_even = _mm_add_epi16(sum_even, samples1);
+ StoreUnaligned16(luma_ptr, samples1);
+ luma_ptr += kCflLumaBufferStride;
+
+ sum_odd = _mm_add_epi16(sum_odd, samples1);
+ StoreUnaligned16(luma_ptr, samples1);
+ luma_ptr += kCflLumaBufferStride;
+ }
+ }
+
+ sum = _mm_add_epi32(_mm_unpackhi_epi16(sum_even, zero),
+ _mm_cvtepu16_epi32(sum_even));
+ sum = _mm_add_epi32(sum, _mm_unpackhi_epi16(sum_odd, zero));
+ sum = _mm_add_epi32(sum, _mm_cvtepu16_epi32(sum_odd));
+
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+ __m128i averages = RightShiftWithRounding_U32(
+ sum, block_height_log2 + 3 /* log2 of width 8 */);
+ averages = _mm_shuffle_epi8(averages, dup16);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 8;
+
+ const int horz_inside = block_width <= max_luma_width;
+ const int vert_inside = block_height <= max_luma_height;
+ if (horz_inside && vert_inside) {
+ CflSubsampler444_8xH_SSE4_1<block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_8xH_SSE4_1<block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+// This function only works for block widths 16 and 32.
+template <int block_width_log2, int block_height_log2, bool inside>
+void CflSubsampler444_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_width_log2 == 4 || block_width_log2 == 5, "");
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 1 << block_width_log2;
+
+ const int visible_height = max_luma_height;
+ const int visible_width_16 = inside ? 16 : std::min(16, max_luma_width);
+ const int invisible_width_16 = 16 - visible_width_16;
+ const __m128i blend_mask_16 = MaskHighNBytes(invisible_width_16);
+ const int visible_width_32 = inside ? 32 : max_luma_width;
+ const int invisible_width_32 = 32 - visible_width_32;
+ const __m128i blend_mask_32 =
+ MaskHighNBytes(std::min(16, invisible_width_32));
+
+ const __m128i dup16 = _mm_set1_epi32(0x01000100);
+ const __m128i zero = _mm_setzero_si128();
+ const auto* src = static_cast<const uint8_t*>(source);
+ int16_t* luma_ptr = luma[0];
+ __m128i sum = _mm_setzero_si128();
+
+ __m128i samples0, samples1;
+ __m128i samples2, samples3;
+ __m128i inner_sum_lo, inner_sum_hi;
+ int y = 0;
+ do {
+#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are
+ // then masked off by blendv, MSAN isn't smart enough to
+ // understand that. So we switch to a C implementation here.
+ uint16_t c_arr[16];
+ for (int x = 0; x < 16; x++) {
+ const int x_index = std::min(x, visible_width_16 - 1);
+ c_arr[x] = src[x_index] << 3;
+ }
+ samples0 = LoadUnaligned16(c_arr);
+ samples1 = LoadUnaligned16(c_arr + 8);
+ static_cast<void>(blend_mask_16);
+#else
+ __m128i samples01 = LoadUnaligned16(src);
+
+ if (!inside) {
+ const __m128i border16 =
+ _mm_set1_epi8(static_cast<int8_t>(src[visible_width_16 - 1]));
+ samples01 = _mm_blendv_epi8(samples01, border16, blend_mask_16);
+ }
+ samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples01), 3);
+ samples1 = _mm_slli_epi16(_mm_unpackhi_epi8(samples01, zero), 3);
+#endif // LIBGAV1_MSAN
+
+ StoreUnaligned16(luma_ptr, samples0);
+ StoreUnaligned16(luma_ptr + 8, samples1);
+ __m128i inner_sum = _mm_add_epi16(samples0, samples1);
+
+ if (block_width == 32) {
+#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are
+ // then masked off by blendv, MSAN isn't smart enough to
+ // understand that. So we switch to a C implementation here.
+ uint16_t c_arr[16];
+ for (int x = 16; x < 32; x++) {
+ const int x_index = std::min(x, visible_width_32 - 1);
+ c_arr[x - 16] = src[x_index] << 3;
+ }
+ samples2 = LoadUnaligned16(c_arr);
+ samples3 = LoadUnaligned16(c_arr + 8);
+ static_cast<void>(blend_mask_32);
+#else
+ __m128i samples23 = LoadUnaligned16(src + 16);
+ if (!inside) {
+ const __m128i border32 =
+ _mm_set1_epi8(static_cast<int8_t>(src[visible_width_32 - 1]));
+ samples23 = _mm_blendv_epi8(samples23, border32, blend_mask_32);
+ }
+ samples2 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples23), 3);
+ samples3 = _mm_slli_epi16(_mm_unpackhi_epi8(samples23, zero), 3);
+#endif // LIBGAV1_MSAN
+
+ StoreUnaligned16(luma_ptr + 16, samples2);
+ StoreUnaligned16(luma_ptr + 24, samples3);
+ inner_sum = _mm_add_epi16(samples2, inner_sum);
+ inner_sum = _mm_add_epi16(samples3, inner_sum);
+ }
+
+ inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
+ inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
+ sum = _mm_add_epi32(sum, inner_sum_lo);
+ sum = _mm_add_epi32(sum, inner_sum_hi);
+ luma_ptr += kCflLumaBufferStride;
+ src += stride;
+ } while (++y < visible_height);
+
+ if (!inside) {
+ for (int y = visible_height; y < block_height;
+ luma_ptr += kCflLumaBufferStride, ++y) {
+ sum = _mm_add_epi32(sum, inner_sum_lo);
+ StoreUnaligned16(luma_ptr, samples0);
+ sum = _mm_add_epi32(sum, inner_sum_hi);
+ StoreUnaligned16(luma_ptr + 8, samples1);
+ if (block_width == 32) {
+ StoreUnaligned16(luma_ptr + 16, samples2);
+ StoreUnaligned16(luma_ptr + 24, samples3);
+ }
+ }
+ }
+
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+ __m128i averages =
+ RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2);
+ averages = _mm_shuffle_epi8(averages, dup16);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ for (int x = 0; x < block_width; x += 8) {
+ __m128i samples = LoadUnaligned16(&luma_ptr[x]);
+ StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples, averages));
+ }
+ }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler444_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_width_log2 == 4 || block_width_log2 == 5, "");
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 1 << block_width_log2;
+ const int horz_inside = block_width <= max_luma_width;
+ const int vert_inside = block_height <= max_luma_height;
+ if (horz_inside && vert_inside) {
+ CflSubsampler444_SSE4_1<block_width_log2, block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_SSE4_1<block_width_log2, block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
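+
+// Reference-only scalar sketch of the 4:4:4 subsampling above for the fully
+// visible case; the name and flat indexing are illustrative. Samples are kept
+// in Q3 so each output is the Q3 sample minus the rounded Q3 block average.
+inline void CflSubsampler444_Scalar(int16_t* ac, const ptrdiff_t ac_stride,
+                                    const uint8_t* src, const ptrdiff_t stride,
+                                    const int width_log2,
+                                    const int height_log2) {
+  const int width = 1 << width_log2;
+  const int height = 1 << height_log2;
+  int sum = 0;
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      sum += src[y * stride + x] << 3;
+    }
+  }
+  const int average = RightShiftWithRounding(sum, width_log2 + height_log2);
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      ac[y * ac_stride + x] =
+          static_cast<int16_t>((src[y * stride + x] << 3) - average);
+    }
+  }
+}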
+
+// Takes in two sums of input row pairs, and completes the computation for two
+// output rows.
+inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0,
+ const __m128i vertical_sum1,
+ int16_t* luma_ptr) {
+ __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
+ result = _mm_slli_epi16(result, 1);
+ StoreLo8(luma_ptr, result);
+ StoreHi8(luma_ptr + kCflLumaBufferStride, result);
+ return result;
+}
+
+// Takes two halves of a vertically added pair of rows and completes the
+// computation for one output row.
+inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0,
+ const __m128i vertical_sum1,
+ int16_t* luma_ptr) {
+ __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
+ result = _mm_slli_epi16(result, 1);
+ StoreUnaligned16(luma_ptr, result);
+ return result;
+}
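+
+// Reference-only note: for 4:2:0 the value stored per chroma position is the
+// 2x2 luma sum shifted left by 1, i.e. the block average in Q3, matching the
+// << 3 used by the 4:4:4 subsamplers. A scalar sketch (illustrative name):
+inline int16_t Luma420SampleQ3(const uint8_t a, const uint8_t b,
+                               const uint8_t c, const uint8_t d) {
+  return static_cast<int16_t>((a + b + c + d) << 1);
+}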
+
+template <int block_height_log2>
+void CflSubsampler420_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint8_t*>(source);
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = 0;
+ do {
+    // Note that double sampling and converting to 16-bit make a row fill the
+    // vector.
+ const __m128i samples_row0 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i samples_row1 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1);
+
+ const __m128i samples_row2 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i samples_row3 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3);
+ __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ const __m128i samples_row4 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i samples_row5 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5);
+
+ const __m128i samples_row6 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i samples_row7 = _mm_cvtepu8_epi16(LoadLo8(src));
+ src += stride;
+ const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ y += 4;
+ } while (y < luma_height);
+ const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill);
+ for (; y < block_height; ++y) {
+ StoreLo8(luma_ptr, final_fill);
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ }
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_U32(
+ final_sum, block_height_log2 + 2 /*log2 of width 4*/);
+
+ averages = _mm_shufflelo_epi16(averages, 0);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples = LoadLo8(luma_ptr);
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ }
+}
+
+// This duplicates the last two 16-bit values in |row|.
+inline __m128i LastRowSamples(const __m128i row) {
+ return _mm_shuffle_epi32(row, 0xFF);
+}
+
+// This duplicates the last 16-bit value in |row|.
+inline __m128i LastRowResult(const __m128i row) {
+ const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF);
+ return _mm_shuffle_epi32(dup_row, 0xFF);
+}
+
+template <int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint8_t*>(source);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ int16_t* luma_ptr = luma[0];
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = 0;
+
+ do {
+ const __m128i samples_row00 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row01 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row00);
+ src += stride;
+ const __m128i samples_row10 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row11 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row10);
+ src += stride;
+ const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10);
+ const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11);
+ __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row20 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row21 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row20);
+ src += stride;
+ const __m128i samples_row30 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row31 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row30);
+ src += stride;
+ const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30);
+ const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row40 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row41 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row40);
+ src += stride;
+ const __m128i samples_row50 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row51 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row50);
+ src += stride;
+ const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50);
+ const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row60 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row61 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row60);
+ src += stride;
+ const __m128i samples_row70 = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i samples_row71 = (max_luma_width == 16)
+ ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
+ : LastRowSamples(samples_row70);
+ src += stride;
+ const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70);
+ const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ y += 4;
+ } while (y < luma_height);
+  // Duplicate the final visible row downward to fill the remaining rows.
+ const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill);
+ const __m128i final_fill_to_sum1 =
+ _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8));
+ const __m128i final_fill_to_sum =
+ _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1);
+ for (; y < block_height; ++y) {
+ StoreUnaligned16(luma_ptr, final_fill);
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ }
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_S32(
+ final_sum, block_height_log2 + 3 /*log2 of width 8*/);
+
+ averages = _mm_shufflelo_epi16(averages, 0);
+ averages = _mm_shuffle_epi32(averages, 0);
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ if (max_luma_width == 8) {
+ CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const auto* src = static_cast<const uint8_t*>(source);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ const int block_height = 1 << block_height_log2;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+
+ int16_t* luma_ptr = luma[0];
+ __m128i final_row_result;
+ // Begin first y section, covering width up to 16.
+ int y = 0;
+ do {
+ const uint8_t* src_next = src + stride;
+ const __m128i samples_row0_lo = LoadUnaligned16(src);
+ const __m128i samples_row00 = _mm_cvtepu8_epi16(samples_row0_lo);
+ const __m128i samples_row01 = (max_luma_width >= 16)
+ ? _mm_unpackhi_epi8(samples_row0_lo, zero)
+ : LastRowSamples(samples_row00);
+ const __m128i samples_row0_hi = LoadUnaligned16(src + 16);
+ const __m128i samples_row02 = (max_luma_width >= 24)
+ ? _mm_cvtepu8_epi16(samples_row0_hi)
+ : LastRowSamples(samples_row01);
+ const __m128i samples_row03 = (max_luma_width == 32)
+ ? _mm_unpackhi_epi8(samples_row0_hi, zero)
+ : LastRowSamples(samples_row02);
+ const __m128i samples_row1_lo = LoadUnaligned16(src_next);
+ const __m128i samples_row10 = _mm_cvtepu8_epi16(samples_row1_lo);
+ const __m128i samples_row11 = (max_luma_width >= 16)
+ ? _mm_unpackhi_epi8(samples_row1_lo, zero)
+ : LastRowSamples(samples_row10);
+ const __m128i samples_row1_hi = LoadUnaligned16(src_next + 16);
+ const __m128i samples_row12 = (max_luma_width >= 24)
+ ? _mm_cvtepu8_epi16(samples_row1_hi)
+ : LastRowSamples(samples_row11);
+ const __m128i samples_row13 = (max_luma_width == 32)
+ ? _mm_unpackhi_epi8(samples_row1_hi, zero)
+ : LastRowSamples(samples_row12);
+ const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10);
+ const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11);
+ const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12);
+ const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13);
+ __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
+ final_row_result =
+ StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
+ sum = _mm_add_epi16(sum, final_row_result);
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ src += stride << 1;
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < luma_height);
+
+ // Because max_luma_width is at most 32, any values beyond x=16 will
+ // necessarily be duplicated.
+ if (block_width_log2 == 5) {
+ const __m128i wide_fill = LastRowResult(final_row_result);
+ // Multiply duplicated value by number of occurrences, height * 4, since
+ // there are 16 in each row and the value appears in the vector 4 times.
+ final_sum = _mm_add_epi32(
+ final_sum,
+ _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), block_height_log2 + 2));
+ }
+
+ // Begin second y section.
+ if (y < block_height) {
+ const __m128i final_fill0 =
+ LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill1 =
+ LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
+ const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
+ const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
+ const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
+ const __m128i final_fill_to_sum =
+ _mm_add_epi32(final_inner_sum0, final_inner_sum1);
+
+ do {
+ StoreUnaligned16(luma_ptr, final_fill0);
+ StoreUnaligned16(luma_ptr + 8, final_fill1);
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ } while (++y < block_height);
+ } // End second y section.
+
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_S32(
+ final_sum, block_width_log2 + block_height_log2);
+ averages = _mm_shufflelo_epi16(averages, 0);
+ averages = _mm_shuffle_epi32(averages, 0);
+
+ luma_ptr = luma[0];
+ for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
+ const __m128i samples0 = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages));
+ const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
+ final_row_result = _mm_sub_epi16(samples1, averages);
+ StoreUnaligned16(luma_ptr + 8, final_row_result);
+ }
+ if (block_width_log2 == 5) {
+ int16_t* wide_luma_ptr = luma[0] + 16;
+ const __m128i wide_fill = LastRowResult(final_row_result);
+ for (int i = 0; i < block_height;
+ ++i, wide_luma_ptr += kCflLumaBufferStride) {
+ StoreUnaligned16(wide_luma_ptr, wide_fill);
+ StoreUnaligned16(wide_luma_ptr + 8, wide_fill);
+ }
+ }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler420_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ switch (max_luma_width) {
+ case 8:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ return;
+ case 16:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ return;
+ case 24:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ return;
+ default:
+ assert(max_luma_width == 32);
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ return;
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 5>;
+#endif
+
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler444_SSE4_1<5, 5>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor_SSE4_1<4, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_SSE4_1<4, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor_SSE4_1<8, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor_SSE4_1<8, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_SSE4_1<8, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_SSE4_1<8, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_SSE4_1<16, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_SSE4_1<16, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_SSE4_1<16, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_SSE4_1<16, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_SSE4_1<32, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_SSE4_1<32, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_SSE4_1<32, 32>;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void IntraPredCflInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredCflInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_smooth_sse4.cc b/src/dsp/x86/intrapred_smooth_sse4.cc
new file mode 100644
index 0000000..e944ea3
--- /dev/null
+++ b/src/dsp/x86/intrapred_smooth_sse4.cc
@@ -0,0 +1,2662 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring> // memcpy
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Note these constants are duplicated from intrapred.cc to give the compiler
+// visibility of the values. This helps reduce loads and aids in the creation
+// of the inverse weights.
+constexpr uint8_t kSmoothWeights[] = {
+ // block dimension = 4
+ 255, 149, 85, 64,
+ // block dimension = 8
+ 255, 197, 146, 105, 73, 50, 37, 32,
+ // block dimension = 16
+ 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+ // block dimension = 32
+ 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+ 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+ // block dimension = 64
+ 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+ 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+ 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
+ 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
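+
+// For reference, the "inverted" weight used throughout this file is simply
+// 256 - w: it is the weight applied to the opposite corner pixel (the
+// top-right pixel for SMOOTH_H, the bottom-left pixel for SMOOTH_V), e.g.
+//   const __m128i scale = _mm_set1_epi16(256);
+//   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);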
+
+template <int y_mask>
+inline void WriteSmoothHorizontalSum4(void* const dest, const __m128i& left,
+ const __m128i& weights,
+ const __m128i& scaled_top_right,
+ const __m128i& round) {
+ const __m128i left_y = _mm_shuffle_epi32(left, y_mask);
+ const __m128i weighted_left_y = _mm_mullo_epi16(left_y, weights);
+ const __m128i pred_sum = _mm_add_epi32(scaled_top_right, weighted_left_y);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 8);
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
+}
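+
+// The 0x0C080400 shuffle mask above gathers byte 0 of each 32-bit lane
+// (bytes 0, 4, 8 and 12), which is how the four 32-bit sums are narrowed to
+// four output pixels; each sum is already < 256 after the shift, so taking
+// the low byte is lossless.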
+
+template <int y_mask>
+inline __m128i SmoothVerticalSum4(const __m128i& top, const __m128i& weights,
+ const __m128i& scaled_bottom_left) {
+ const __m128i weights_y = _mm_shuffle_epi32(weights, y_mask);
+ const __m128i weighted_top_y = _mm_mullo_epi16(top, weights_y);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi32(scaled_bottom_left, y_mask);
+ return _mm_add_epi32(scaled_bottom_left_y, weighted_top_y);
+}
+
+template <int y_mask>
+inline void WriteSmoothVerticalSum4(uint8_t* dest, const __m128i& top,
+ const __m128i& weights,
+ const __m128i& scaled_bottom_left,
+ const __m128i& round) {
+ __m128i pred_sum =
+ SmoothVerticalSum4<y_mask>(top, weights, scaled_bottom_left);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ pred_sum = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 8);
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ Store4(dest, _mm_shuffle_epi8(pred_sum, cvtepi32_epi8));
+}
+
+// For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V,
+// |pixels| is a segment of the top row or the whole top row, and |weights| is
+// repeated.
+inline __m128i SmoothDirectionalSum8(const __m128i& pixels,
+ const __m128i& weights,
+ const __m128i& scaled_corner) {
+ const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
+ return _mm_add_epi16(scaled_corner, weighted_px);
+}
+
+inline void WriteSmoothDirectionalSum8(uint8_t* dest, const __m128i& pixels,
+ const __m128i& weights,
+ const __m128i& scaled_corner,
+ const __m128i& round) {
+ const __m128i pred_sum =
+ SmoothDirectionalSum8(pixels, weights, scaled_corner);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, round), 8);
+ StoreLo8(dest, _mm_packus_epi16(pred, pred));
+}
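+
+// As a rough scalar model of the two uses described above (reference only):
+//   SMOOTH_H: dst[x] = (left[y] * w[x] + (256 - w[x]) * top_right + 128) >> 8
+//   SMOOTH_V: dst[x] = (top[x] * w[y] + (256 - w[y]) * bottom_left + 128) >> 8
+// where the (256 - w) * corner product arrives precomputed in |scaled_corner|
+// and |round| is 128.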
+
+// For Horizontal, pixels1 and pixels2 are the same repeated value. For
+// Vertical, weights1 and weights2 are the same, and scaled_corner1 and
+// scaled_corner2 are the same.
+inline void WriteSmoothDirectionalSum16(uint8_t* dest, const __m128i& pixels1,
+ const __m128i& pixels2,
+ const __m128i& weights1,
+ const __m128i& weights2,
+ const __m128i& scaled_corner1,
+ const __m128i& scaled_corner2,
+ const __m128i& round) {
+ const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
+ const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
+ const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
+ const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
+ const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
+ StoreUnaligned16(dest, _mm_packus_epi16(pred1, pred2));
+}
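+
+// A 16-wide row is therefore two independent 8-lane sums: pred1 covers output
+// bytes 0..7 and pred2 bytes 8..15, and _mm_packus_epi16 narrows both into a
+// single 16-byte store.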
+
+template <int y_mask>
+inline void WriteSmoothPredSum4(uint8_t* const dest, const __m128i& top,
+ const __m128i& left, const __m128i& weights_x,
+ const __m128i& weights_y,
+ const __m128i& scaled_bottom_left,
+ const __m128i& scaled_top_right,
+ const __m128i& round) {
+ const __m128i left_y = _mm_shuffle_epi32(left, y_mask);
+ const __m128i weighted_left_y = _mm_mullo_epi32(left_y, weights_x);
+ const __m128i weight_y = _mm_shuffle_epi32(weights_y, y_mask);
+ const __m128i weighted_top = _mm_mullo_epi32(weight_y, top);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi32(scaled_bottom_left, y_mask);
+ const __m128i col_pred = _mm_add_epi32(scaled_bottom_left_y, weighted_left_y);
+ const __m128i row_pred = _mm_add_epi32(scaled_top_right, weighted_top);
+ const __m128i pred_sum = _mm_add_epi32(row_pred, col_pred);
+
+ // Equivalent to RightShiftWithRounding(pred[x][y], 9).
+ const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 9);
+
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
+}
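+
+// For reference, the per-pixel sum above is the full SMOOTH predictor:
+//   pred[y][x] = (w_y * top[x] + (256 - w_y) * bottom_left +
+//                 w_x * left[y] + (256 - w_x) * top_right + 256) >> 9
+// with the two (256 - w) * corner products arriving precomputed in
+// |scaled_bottom_left| and |scaled_top_right|, and |round| equal to 256.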
+
+// pixels[0]: above and below_pred interleave vector
+// pixels[1]: left vector
+// pixels[2]: right_pred vector
+inline void LoadSmoothPixels4(const uint8_t* above, const uint8_t* left,
+ const int height, __m128i* pixels) {
+ if (height == 4) {
+ pixels[1] = Load4(left);
+ } else if (height == 8) {
+ pixels[1] = LoadLo8(left);
+ } else {
+ pixels[1] = LoadUnaligned16(left);
+ }
+
+ const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+ const __m128i top = _mm_cvtepu8_epi16(Load4(above));
+ pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
+ pixels[2] = _mm_set1_epi16(above[3]);
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], second half for height = 16 only
+// weight_h[3]: same as [1], second half for height = 16 only
+// weight_w[0]: weights_w and scale - weights_w interleave vector
+inline void LoadSmoothWeights4(const uint8_t* weight_array, const int height,
+ __m128i* weight_h, __m128i* weight_w) {
+ const __m128i scale = _mm_set1_epi16(256);
+ const __m128i x_weights = Load4(weight_array);
+ weight_h[0] = _mm_cvtepu8_epi16(x_weights);
+ weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+ weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+
+ if (height == 8) {
+ const __m128i y_weights = LoadLo8(weight_array + 4);
+ weight_h[0] = _mm_cvtepu8_epi16(y_weights);
+ weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+ } else if (height == 16) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i y_weights = LoadUnaligned16(weight_array + 12);
+ weight_h[0] = _mm_cvtepu8_epi16(y_weights);
+ weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(y_weights, zero);
+ weight_h[3] = _mm_sub_epi16(scale, weight_h[2]);
+ }
+}
+
+inline void WriteSmoothPred4x8(const __m128i* pixel, const __m128i* weights_y,
+ const __m128i* weight_x, uint8_t* dst,
+ const ptrdiff_t stride,
+ const bool use_second_half) {
+ const __m128i round = _mm_set1_epi32(256);
+ const __m128i mask_increment = _mm_set1_epi16(0x0202);
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i left = use_second_half ? _mm_unpackhi_epi8(pixel[1], zero)
+ : _mm_unpacklo_epi8(pixel[1], zero);
+ __m128i y_select = _mm_set1_epi16(0x0100);
+
+ for (int i = 0; i < 8; ++i) {
+ const __m128i weight_y = _mm_shuffle_epi8(weights_y[0], y_select);
+ const __m128i inverted_weight_y = _mm_shuffle_epi8(weights_y[1], y_select);
+ const __m128i interleaved_weights =
+ _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+ __m128i vertical_pred = _mm_madd_epi16(pixel[0], interleaved_weights);
+
+ __m128i horizontal_vect = _mm_shuffle_epi8(left, y_select);
+ horizontal_vect = _mm_unpacklo_epi16(horizontal_vect, pixel[2]);
+ __m128i sum = _mm_madd_epi16(horizontal_vect, weight_x[0]);
+
+ sum = _mm_add_epi32(vertical_pred, sum);
+ sum = _mm_add_epi32(sum, round);
+ sum = _mm_srai_epi32(sum, 9);
+
+ sum = _mm_shuffle_epi8(sum, cvtepi32_epi8);
+ Store4(dst, sum);
+ dst += stride;
+
+ y_select = _mm_add_epi16(y_select, mask_increment);
+ }
+}
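+
+// The _mm_madd_epi16 calls above are what make the interleaving pay off: with
+// pixel[0] holding (top[x], bottom_left) pairs and |interleaved_weights|
+// holding (w_y, 256 - w_y) pairs, each 32-bit madd lane is
+//   w_y * top[x] + (256 - w_y) * bottom_left,
+// and the horizontal madd likewise yields
+//   w_x * left[y] + (256 - w_x) * top_right,
+// so one add, the +256 round and a >> 9 complete the SMOOTH sum per pixel.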
+
+// The interleaving approach has some overhead that causes it to underperform in
+// the 4x4 case.
+void Smooth4x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+ const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+ const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+ const __m128i scale = _mm_set1_epi32(256);
+  // Fourth 32-bit lane is top_row[3].
+ const __m128i top_right = _mm_shuffle_epi32(top, 0xFF);
+  // Fourth 32-bit lane is left_column[3].
+ const __m128i bottom_left = _mm_shuffle_epi32(left, 0xFF);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ auto* dst = static_cast<uint8_t*>(dest);
+  // AV1 spec 7.11.2.6 (3) describes the sum:
+  //   smoothPred[y][x:x+3] = weighted_top + scaled_right + weighted_left[y] +
+  //                          scaled_bottom[y]
+  // This could be a loop, but the shuffles require compile-time immediate
+  // values, so it is unrolled.
+ WriteSmoothPredSum4<0>(dst, top, left, weights, weights, scaled_bottom_left,
+ scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothPredSum4<0x55>(dst, top, left, weights, weights,
+ scaled_bottom_left, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothPredSum4<0xAA>(dst, top, left, weights, weights,
+ scaled_bottom_left, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothPredSum4<0xFF>(dst, top, left, weights, weights,
+ scaled_bottom_left, scaled_top_right, scale);
+}
+
+void Smooth4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i weights_x[1];
+ __m128i weights_y[2];
+ LoadSmoothWeights4(kSmoothWeights, 8, weights_y, weights_x);
+ __m128i pixels[3];
+ LoadSmoothPixels4(top_ptr, left_ptr, 8, pixels);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
+}
+
+void Smooth4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i weights_x[1];
+ __m128i weights_y[4];
+ LoadSmoothWeights4(kSmoothWeights, 16, weights_y, weights_x);
+ __m128i pixels[3];
+ LoadSmoothPixels4(top_ptr, left_ptr, 16, pixels);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
+ dst += stride << 3;
+ WriteSmoothPred4x8(pixels, &weights_y[2], weights_x, dst, stride, true);
+}
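+
+// The 4x16 case reuses the 8-row writer twice: the first pass consumes the
+// low halves of the 16 left pixels and y weights, and the second pass,
+// offset by eight rows of |stride|, consumes the high halves via
+// |use_second_half| and |weights_y[2]|.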
+
+// pixels[0]: above and below_pred interleave vector, first half
+// pixels[1]: above and below_pred interleave vector, second half
+// pixels[2]: left vector
+// pixels[3]: right_pred vector
+// pixels[4]: above and below_pred interleave vector, first half (height 32)
+// pixels[5]: above and below_pred interleave vector, second half (height 32)
+// pixels[6]: left vector + 16 (height 32 only)
+// pixels[7]: right_pred vector (height 32 only)
+inline void LoadSmoothPixels8(const uint8_t* above, const uint8_t* left,
+ const int height, __m128i* pixels) {
+ const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+ __m128i top_row = _mm_cvtepu8_epi16(LoadLo8(above));
+ pixels[0] = _mm_unpacklo_epi16(top_row, bottom_left);
+ pixels[1] = _mm_unpackhi_epi16(top_row, bottom_left);
+
+ pixels[3] = _mm_set1_epi16(above[7]);
+
+ if (height == 4) {
+ pixels[2] = Load4(left);
+ } else if (height == 8) {
+ pixels[2] = LoadLo8(left);
+ } else if (height == 16) {
+ pixels[2] = LoadUnaligned16(left);
+ } else {
+ pixels[2] = LoadUnaligned16(left);
+ pixels[4] = pixels[0];
+ pixels[5] = pixels[1];
+ pixels[6] = LoadUnaligned16(left + 16);
+ pixels[7] = pixels[3];
+ }
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
+// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+inline void LoadSmoothWeights8(const uint8_t* weight_array, const int height,
+ __m128i* weight_w, __m128i* weight_h) {
+ const int offset = (height < 8) ? 0 : 4;
+ __m128i loaded_weights = LoadUnaligned16(&weight_array[offset]);
+ weight_h[0] = _mm_cvtepu8_epi16(loaded_weights);
+ const __m128i inverter = _mm_set1_epi16(256);
+ weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
+
+ if (height == 4) {
+ loaded_weights = _mm_srli_si128(loaded_weights, 4);
+ __m128i weights_x = _mm_cvtepu8_epi16(loaded_weights);
+ __m128i inverted_weights_x = _mm_sub_epi16(inverter, weights_x);
+ weight_w[0] = _mm_unpacklo_epi16(weights_x, inverted_weights_x);
+ weight_w[1] = _mm_unpackhi_epi16(weights_x, inverted_weights_x);
+ } else {
+ weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+ weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
+ }
+
+ if (height == 16) {
+ const __m128i zero = _mm_setzero_si128();
+ loaded_weights = LoadUnaligned16(weight_array + 12);
+ weight_h[0] = _mm_cvtepu8_epi16(loaded_weights);
+ weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(loaded_weights, zero);
+ weight_h[3] = _mm_sub_epi16(inverter, weight_h[2]);
+ } else if (height == 32) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i weight_lo = LoadUnaligned16(weight_array + 28);
+ weight_h[0] = _mm_cvtepu8_epi16(weight_lo);
+ weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+ weight_h[3] = _mm_sub_epi16(inverter, weight_h[2]);
+ const __m128i weight_hi = LoadUnaligned16(weight_array + 44);
+ weight_h[4] = _mm_cvtepu8_epi16(weight_hi);
+ weight_h[5] = _mm_sub_epi16(inverter, weight_h[4]);
+ weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
+ weight_h[7] = _mm_sub_epi16(inverter, weight_h[6]);
+ }
+}
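+
+// The magic offsets above follow from the layout of kSmoothWeights: the
+// weights for block dimension d start at index d - 4 (4 -> 0, 8 -> 4,
+// 16 -> 12, 32 -> 28, 64 -> 60). SmoothWxH below relies on the same rule via
+// kSmoothWeights + height - 4 and kSmoothWeights + width - 4.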
+
+inline void WriteSmoothPred8xH(const __m128i* pixels, const __m128i* weights_x,
+ const __m128i* weights_y, const int height,
+ uint8_t* dst, const ptrdiff_t stride,
+ const bool use_second_half) {
+ const __m128i round = _mm_set1_epi32(256);
+ const __m128i mask_increment = _mm_set1_epi16(0x0202);
+ const __m128i cvt_epu16_epi8 = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i left = use_second_half ? _mm_unpackhi_epi8(pixels[2], zero)
+ : _mm_unpacklo_epi8(pixels[2], zero);
+ __m128i y_select = _mm_set1_epi16(0x100);
+
+ for (int i = 0; i < height; ++i) {
+ const __m128i weight_y = _mm_shuffle_epi8(weights_y[0], y_select);
+ const __m128i inverted_weight_y = _mm_shuffle_epi8(weights_y[1], y_select);
+ const __m128i interleaved_weights =
+ _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+ const __m128i vertical_sum0 =
+ _mm_madd_epi16(pixels[0], interleaved_weights);
+ const __m128i vertical_sum1 =
+ _mm_madd_epi16(pixels[1], interleaved_weights);
+
+ __m128i horizontal_pixels = _mm_shuffle_epi8(left, y_select);
+ horizontal_pixels = _mm_unpacklo_epi16(horizontal_pixels, pixels[3]);
+ const __m128i horizontal_sum0 =
+ _mm_madd_epi16(horizontal_pixels, weights_x[0]);
+ const __m128i horizontal_sum1 =
+ _mm_madd_epi16(horizontal_pixels, weights_x[1]);
+
+ __m128i sum0 = _mm_add_epi32(vertical_sum0, horizontal_sum0);
+ sum0 = _mm_add_epi32(sum0, round);
+ sum0 = _mm_srai_epi32(sum0, 9);
+
+ __m128i sum1 = _mm_add_epi32(vertical_sum1, horizontal_sum1);
+ sum1 = _mm_add_epi32(sum1, round);
+ sum1 = _mm_srai_epi32(sum1, 9);
+
+ sum0 = _mm_packus_epi16(sum0, sum1);
+ sum0 = _mm_shuffle_epi8(sum0, cvt_epu16_epi8);
+ StoreLo8(dst, sum0);
+ dst += stride;
+
+ y_select = _mm_add_epi16(y_select, mask_increment);
+ }
+}
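+
+// |y_select| starts at 0x0100 and advances by 0x0202 per row. As a
+// _mm_shuffle_epi8 mask it broadcasts 16-bit lane 0, then lane 1, and so on,
+// so each iteration picks up row y's weight and left pixel without another
+// load: row 0 reads source bytes {0, 1}, row 1 reads bytes {2, 3}, etc.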
+
+void Smooth8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i pixels[4];
+ LoadSmoothPixels8(top_ptr, left_ptr, 4, pixels);
+
+ __m128i weights_x[2], weights_y[2];
+ LoadSmoothWeights8(kSmoothWeights, 4, weights_x, weights_y);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred8xH(pixels, weights_x, weights_y, 4, dst, stride, false);
+}
+
+void Smooth8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+
+ __m128i pixels[4];
+ LoadSmoothPixels8(top_ptr, left_ptr, 8, pixels);
+
+ __m128i weights_x[2], weights_y[2];
+ LoadSmoothWeights8(kSmoothWeights, 8, weights_x, weights_y);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+}
+
+void Smooth8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i pixels[4];
+ LoadSmoothPixels8(top_ptr, left_ptr, 16, pixels);
+
+ __m128i weights_x[2], weights_y[4];
+ LoadSmoothWeights8(kSmoothWeights, 16, weights_x, weights_y);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+ dst += stride << 3;
+ WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
+}
+
+void Smooth8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ __m128i pixels[8];
+ LoadSmoothPixels8(top_ptr, left_ptr, 32, pixels);
+
+ __m128i weights_x[2], weights_y[8];
+ LoadSmoothWeights8(kSmoothWeights, 32, weights_x, weights_y);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+ dst += stride << 3;
+ WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
+ dst += stride << 3;
+ WriteSmoothPred8xH(&pixels[4], weights_x, &weights_y[4], 8, dst, stride,
+ false);
+ dst += stride << 3;
+ WriteSmoothPred8xH(&pixels[4], weights_x, &weights_y[6], 8, dst, stride,
+ true);
+}
+
+template <int width, int height>
+void SmoothWxH(void* const dest, const ptrdiff_t stride,
+ const void* const top_row, const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const uint8_t* const sm_weights_h = kSmoothWeights + height - 4;
+ const uint8_t* const sm_weights_w = kSmoothWeights + width - 4;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i scale_value = _mm_set1_epi16(256);
+ const __m128i bottom_left = _mm_cvtsi32_si128(left_ptr[height - 1]);
+ const __m128i top_right = _mm_set1_epi16(top_ptr[width - 1]);
+ const __m128i round = _mm_set1_epi32(256);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < height; ++y) {
+ const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
+ const __m128i left_y = _mm_cvtsi32_si128(left_ptr[y]);
+ const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
+ __m128i scaled_bottom_left =
+ _mm_mullo_epi16(scale_m_weights_y, bottom_left);
+ const __m128i weight_left_y =
+ _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
+ scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
+ scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
+ for (int x = 0; x < width; x += 8) {
+ const __m128i top_x = LoadLo8(top_ptr + x);
+ const __m128i weights_x = LoadLo8(sm_weights_w + x);
+ const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
+ const __m128i top_weights_x_lo = _mm_cvtepu8_epi16(top_weights_x);
+ const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);
+
+      // Each madd multiplies a pixel by the weight of the opposite dimension
+      // (top by the y weight, the x weight by the left pixel); the order of
+      // interleaving is indicated by the variable names.
+ __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
+ __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);
+
+ // |scaled_bottom_left| is always scaled by the same weight each row, so
+ // we only derive |scaled_top_right| values here.
+ const __m128i inverted_weights_x =
+ _mm_sub_epi16(scale_value, _mm_cvtepu8_epi16(weights_x));
+ const __m128i scaled_top_right =
+ _mm_mullo_epi16(inverted_weights_x, top_right);
+ const __m128i scaled_top_right_lo = _mm_cvtepu16_epi32(scaled_top_right);
+ const __m128i scaled_top_right_hi =
+ _mm_unpackhi_epi16(scaled_top_right, zero);
+ pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
+ pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
+ pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
+ pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);
+
+ // The round value for RightShiftWithRounding was added with
+ // |scaled_bottom_left|.
+ pred_lo = _mm_srli_epi32(pred_lo, 9);
+ pred_hi = _mm_srli_epi32(pred_hi, 9);
+ const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+ StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
+ }
+ dst += stride;
+ }
+}
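+
+// A scalar sketch of the inner loop above (reference only):
+//   pred = w_y * top[x] + w_x[x] * left[y]        // the madd pair
+//        + (256 - w_y) * bottom_left + 256        // |scaled_bottom_left|
+//        + (256 - w_x[x]) * top_right;            // |scaled_top_right|
+//   dst[x] = pred >> 9;
+// i.e. RightShiftWithRounding of the four-term SMOOTH sum, with the rounding
+// constant folded into |scaled_bottom_left| once per row.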
+
+void SmoothHorizontal4x4_SSE4_1(void* dest, const ptrdiff_t stride,
+ const void* top_row, const void* left_column) {
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi32(top_ptr[3]);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left = _mm_cvtepu8_epi32(Load4(left_ptr));
+ const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi32(256);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi32(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi32(top[3]);
+ const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi32(256);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi32(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+
+ left = _mm_cvtepu8_epi32(Load4(left_ptr + 4));
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi32(top[3]);
+ const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi32(256);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi32(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+
+ left = _mm_cvtepu8_epi32(Load4(left_ptr + 4));
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+
+ left = _mm_cvtepu8_epi32(Load4(left_ptr + 8));
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+
+ left = _mm_cvtepu8_epi32(Load4(left_ptr + 12));
+ WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+ dst += stride;
+ WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[7]);
+ const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi16(128);
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[7]);
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[7]);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[7]);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal16x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ __m128i y_mask = _mm_set1_epi32(0x01000100);
+ __m128i left_y = _mm_shuffle_epi8(left, y_mask);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x03020302);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x05040504);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x07060706);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+}
+
+void SmoothHorizontal16x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal16x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal16x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal16x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[15]);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ dst += stride;
+ }
+ }
+}
+
+void SmoothHorizontal32x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+}
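+
+// In the 32- and 64-wide horizontal predictors, each 16-pixel slice of a row
+// carries its own (weights, scaled_top_right) pair, so a 32-wide row is two
+// WriteSmoothDirectionalSum16 calls at dst and dst + 16, and a 64-wide row
+// is four (dst, dst + 16, dst + 32, dst + 48).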
+
+void SmoothHorizontal32x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ const __m128i left2 =
+ _mm_cvtepu8_epi16(LoadLo8(static_cast<const uint8_t*>(left_column) + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal32x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal32x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ dst += stride;
+ }
+ }
+}
+
+void SmoothHorizontal64x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[63]);
+ const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+ const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ const __m128i left2 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+}
+
+void SmoothHorizontal64x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[63]);
+ const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+ const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left2 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ const __m128i left3 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ const __m128i left4 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+}
+
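+// For 64x64 the left column is consumed eight entries at a time; each chunk
+// drives eight rows of four 16-pixel-wide writes across the 64-pixel row.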
+void SmoothHorizontal64x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ const __m128i top_right = _mm_set1_epi16(top[63]);
+ const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+ const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ scale = _mm_set1_epi16(128);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, scale);
+ WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, scale);
+ WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, scale);
+ WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, scale);
+ dst += stride;
+ }
+ }
+}
+
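+// Interleaves the four top pixels with the bottom-left pixel so each 32-bit
+// lane holds a (top[x], bottom_left) pair, ready for one madd per output row.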
+inline void LoadSmoothVerticalPixels4(const uint8_t* above, const uint8_t* left,
+ const int height, __m128i* pixels) {
+ __m128i top = Load4(above);
+ const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+ top = _mm_cvtepu8_epi16(top);
+ pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
+}
+
+// |weight_array| alternates weight vectors from the table with their inverted
+// (256-w) counterparts. This is precomputed by the compiler when the weights
+// table is visible to this module. Removing this visibility can cut speed by up
+// to half in both 4xH and 8xH transforms.
+inline void LoadSmoothVerticalWeights4(const uint8_t* weight_array,
+ const int height, __m128i* weights) {
+ const __m128i inverter = _mm_set1_epi16(256);
+
+ if (height == 4) {
+ const __m128i weight = Load4(weight_array);
+ weights[0] = _mm_cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
+ } else if (height == 8) {
+ const __m128i weight = LoadLo8(weight_array + 4);
+ weights[0] = _mm_cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
+ } else {
+ const __m128i weight = LoadUnaligned16(weight_array + 12);
+ const __m128i zero = _mm_setzero_si128();
+ weights[0] = _mm_cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
+ weights[2] = _mm_unpackhi_epi8(weight, zero);
+ weights[3] = _mm_sub_epi16(inverter, weights[2]);
+ }
+}
+
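+// Writes |height| rows of a 4-wide smooth vertical prediction: the row weight
+// is applied to the top pixel and the inverted weight to the bottom-left
+// pixel, then the sum is rounded and shifted by 8. An illustrative scalar
+// sketch of the same blend (sw[y] is the smooth weight for row y):
+//   for (int y = 0; y < height; ++y) {
+//     for (int x = 0; x < 4; ++x) {
+//       dst[x] = (sw[y] * top[x] + (256 - sw[y]) * bottom_left + 128) >> 8;
+//     }
+//     dst += stride;
+//   }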
+inline void WriteSmoothVertical4xH(const __m128i* pixel, const __m128i* weight,
+ const int height, uint8_t* dst,
+ const ptrdiff_t stride) {
+ const __m128i pred_round = _mm_set1_epi32(128);
+ const __m128i mask_increment = _mm_set1_epi16(0x0202);
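+ // Shuffle control (0x0C080400 in each lane) that gathers the low byte of
+ // each 32-bit sum so the four predicted pixels can be written with Store4.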
+ const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400);
+ __m128i y_select = _mm_set1_epi16(0x0100);
+
+ for (int y = 0; y < height; ++y) {
+ const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
+ const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
+ const __m128i alternate_weights =
+ _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+ // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
+ // The madd instruction yields four results of the form:
+ // (top_row[x] * weight[y] + corner * inverted_weight[y])
+ __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
+ sum = _mm_add_epi32(sum, pred_round);
+ sum = _mm_srai_epi32(sum, 8);
+ sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
+ Store4(dst, sum);
+ dst += stride;
+ y_select = _mm_add_epi16(y_select, mask_increment);
+ }
+}
+
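+// The SmoothVertical*_SSE4_1 functions below differ only in how the weights
+// and the top row are loaded; the per-row blend is the same as sketched above.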
+void SmoothVertical4x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const auto* const above = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i pixels;
+ LoadSmoothVerticalPixels4(above, left, 4, &pixels);
+
+ __m128i weights[2];
+ LoadSmoothVerticalWeights4(kSmoothWeights, 4, weights);
+
+ WriteSmoothVertical4xH(&pixels, weights, 4, dst, stride);
+}
+
+void SmoothVertical4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const auto* const above = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i pixels;
+ LoadSmoothVerticalPixels4(above, left, 8, &pixels);
+
+ __m128i weights[2];
+ LoadSmoothVerticalWeights4(kSmoothWeights, 8, weights);
+
+ WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
+}
+
+void SmoothVertical4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint8_t*>(left_column);
+ const auto* const above = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i pixels;
+ LoadSmoothVerticalPixels4(above, left, 16, &pixels);
+
+ __m128i weights[4];
+ LoadSmoothVerticalWeights4(kSmoothWeights, 16, weights);
+
+ WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
+ dst += stride << 3;
+ WriteSmoothVertical4xH(&pixels, &weights[2], 8, dst, stride);
+}
+
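+// 8x4 is short enough that its four rows are written with explicit y_select
+// masks rather than the y_mask loop used by the taller blocks.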
+void SmoothVertical8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
+ const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+}
+
+void SmoothVertical8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ scale = _mm_set1_epi16(128);
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ scale = _mm_set1_epi16(128);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical16x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
+ const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+}
+
+void SmoothVertical16x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical16x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical16x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
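+// For 64-row blocks the 64 weights are read 16 at a time; each outer
+// iteration forms the scaled bottom-left terms for its chunk and then writes
+// 16 rows.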
+void SmoothVertical16x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+ const __m128i scale = _mm_set1_epi16(256);
+ const __m128i round = _mm_set1_epi16(128);
+ const __m128i zero = _mm_setzero_si128();
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ const uint8_t* weights_base_ptr = kSmoothWeights + 60;
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
+}
+
+void SmoothVertical32x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+ const __m128i top_lo = LoadUnaligned16(top_ptr);
+ const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ scale = _mm_set1_epi16(128);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical32x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+ const __m128i top_lo = LoadUnaligned16(top_ptr);
+ const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ scale = _mm_set1_epi16(128);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical32x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i top_lo = LoadUnaligned16(top_ptr);
+ const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ scale = _mm_set1_epi16(128);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical32x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+ const __m128i top_lo = LoadUnaligned16(top_ptr);
+ const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i scale = _mm_set1_epi16(256);
+ const __m128i round = _mm_set1_epi16(128);
+ const uint8_t* weights_base_ptr = kSmoothWeights + 60;
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
+}
+
+void SmoothVertical64x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_lolo = LoadUnaligned16(top_ptr);
+ const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+
+ const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+ const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ scale = _mm_set1_epi16(128);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical64x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+ const __m128i top_lolo = LoadUnaligned16(top_ptr);
+ const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+ const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+ const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+ const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+ const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ __m128i scale = _mm_set1_epi16(256);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ scale = _mm_set1_epi16(128);
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ scale);
+ dst += stride;
+ }
+}
+
+void SmoothVertical64x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+ const __m128i top_lolo = LoadUnaligned16(top_ptr);
+ const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+ const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+ const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+ const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i scale = _mm_set1_epi16(256);
+ const __m128i round = _mm_set1_epi16(128);
+ const uint8_t* weights_base_ptr = kSmoothWeights + 60;
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
+}
+
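+// Registers the SSE4.1 smooth, smooth-vertical and smooth-horizontal
+// predictors for 8 bpp; each entry is guarded by DSP_ENABLED_8BPP_SSE4_1 so
+// individual transform sizes can fall back to the C implementations.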
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ Smooth4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ Smooth4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ Smooth4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ Smooth8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ Smooth8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ Smooth8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ Smooth8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ SmoothWxH<16, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ SmoothWxH<16, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ SmoothWxH<16, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ SmoothWxH<16, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ SmoothWxH<16, 64>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ SmoothWxH<32, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ SmoothWxH<32, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ SmoothWxH<32, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ SmoothWxH<32, 64>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ SmoothWxH<64, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ SmoothWxH<64, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmooth)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ SmoothWxH<64, 64>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical16x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ SmoothVertical32x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical32x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical32x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical32x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ SmoothVertical64x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ SmoothVertical64x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmoothVertical)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ SmoothVertical64x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal16x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal32x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal32x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal32x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal32x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal64x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal64x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmoothHorizontal)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ SmoothHorizontal64x64_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void IntraPredSmoothInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredSmoothInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_sse4.cc b/src/dsp/x86/intrapred_sse4.cc
new file mode 100644
index 0000000..9938dfe
--- /dev/null
+++ b/src/dsp/x86/intrapred_sse4.cc
@@ -0,0 +1,3535 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring> // memcpy
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// Utility Functions
+
+// This is a fast way to divide by a number of the form 2^n + 2^k, n > k.
+// Divide by 2^k by right shifting by k, leaving the denominator 2^(n-k) + 1.
+// In the block size cases, n - k is 1 or 2 (block is proportional to 1x2 or
+// 1x4), so we use a multiplier that reflects division by 2+1=3 or 4+1=5 in
+// the high bits.
+constexpr int kThreeInverse = 0x5556;
+constexpr int kFiveInverse = 0x3334;
+template <int shiftk, int multiplier>
+inline __m128i DivideByMultiplyShift_U32(const __m128i dividend) {
+ const __m128i interm = _mm_srli_epi32(dividend, shiftk);
+ return _mm_mulhi_epi16(interm, _mm_cvtsi32_si128(multiplier));
+}
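+
+// For example, the DC of a 4x8 block is a sum of 12 pixels divided by
+// 4 + 8 = 12 = 2^3 + 2^2: shifting the rounded sum (rounder = 2 + 4 = 6)
+// right by shiftk = 2 leaves a division by 3, which kThreeInverse
+// approximates as x * 0x5556 >> 16 (0x5556 ~= 2^16 / 3). A quick sanity
+// check of that arithmetic on an all-255 4x8 block:
+static_assert(((((12 * 255 + 6) >> 2) * kThreeInverse) >> 16) == 255,
+              "4x8 DC division example");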
+
+// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
+// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
+constexpr int kDuplicateFirstHalf = 0x44;
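+// That is, 0x44 == _MM_SHUFFLE(1, 0, 1, 0): for example,
+// _mm_shuffle_epi32({a, b, c, d}, kDuplicateFirstHalf) yields {a, b, a, b}.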
+
+//------------------------------------------------------------------------------
+// DcPredFuncs_SSE4_1
+
+using DcSumFunc = __m128i (*)(const void* ref);
+using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, const __m128i dc);
+using WriteDuplicateFunc = void (*)(void* dest, ptrdiff_t stride,
+ const __m128i column);
+// For copying an entire column across a block.
+using ColumnStoreFunc = void (*)(void* dest, ptrdiff_t stride,
+ const void* column);
+
+// DC intra-predictors for square and non-square blocks.
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+ DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+struct DcPredFuncs_SSE4_1 {
+ DcPredFuncs_SSE4_1() = delete;
+
+ static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+};
+
+// Directional (vertical/horizontal) intra-predictors for square and
+// non-square blocks.
+template <ColumnStoreFunc col_storefn>
+struct DirectionalPredFuncs_SSE4_1 {
+ DirectionalPredFuncs_SSE4_1() = delete;
+
+ static void Vertical(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void Horizontal(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+};
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+ DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
+ shiftk, dc_mult>::DcTop(void* const dest,
+ ptrdiff_t stride,
+ const void* const top_row,
+ const void* /*left_column*/) {
+ const __m128i rounder = _mm_set1_epi32(1 << (width_log2 - 1));
+ const __m128i sum = top_sumfn(top_row);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2);
+ storefn(dest, stride, dc);
+}
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+ DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
+ shiftk,
+ dc_mult>::DcLeft(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const __m128i rounder = _mm_set1_epi32(1 << (height_log2 - 1));
+ const __m128i sum = left_sumfn(left_column);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), height_log2);
+ storefn(dest, stride, dc);
+}
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+ DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
+ shiftk, dc_mult>::Dc(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i rounder =
+ _mm_set1_epi32((1 << (width_log2 - 1)) + (1 << (height_log2 - 1)));
+ const __m128i sum_top = top_sumfn(top_row);
+ const __m128i sum_left = left_sumfn(left_column);
+ const __m128i sum = _mm_add_epi32(sum_top, sum_left);
+ if (width_log2 == height_log2) {
+ const __m128i dc =
+ _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2 + 1);
+ storefn(dest, stride, dc);
+ } else {
+ const __m128i dc =
+ DivideByMultiplyShift_U32<shiftk, dc_mult>(_mm_add_epi32(sum, rounder));
+ storefn(dest, stride, dc);
+ }
+}
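+
+// In scalar terms, for a WxH block the three entry points above compute
+//   DcTop:  dc = (sum(top) + W / 2) / W
+//   DcLeft: dc = (sum(left) + H / 2) / H
+//   Dc:     dc = (sum(top) + sum(left) + (W + H) / 2) / (W + H)
+// and fill the block with |dc|; the template parameters only vary how the
+// sums, the division and the stores are carried out.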
+
+//------------------------------------------------------------------------------
+// DirectionalPredFuncs_SSE4_1
+
+template <ColumnStoreFunc col_storefn>
+void DirectionalPredFuncs_SSE4_1<col_storefn>::Horizontal(
+ void* const dest, ptrdiff_t stride, const void* /*top_row*/,
+ const void* const left_column) {
+ col_storefn(dest, stride, left_column);
+}
+
+} // namespace
+
+//------------------------------------------------------------------------------
+namespace low_bitdepth {
+namespace {
+
+// |ref| points to 4 bytes containing 4 packed 8-bit pixel values.
+inline __m128i DcSum4_SSE4_1(const void* const ref) {
+ const __m128i vals = Load4(ref);
+ const __m128i zero = _mm_setzero_si128();
+ return _mm_sad_epu8(vals, zero);
+}
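+
+// Here and in the wider DcSum variants below, _mm_sad_epu8 against zero acts
+// as a horizontal byte sum: |x - 0| is just x, and the eight absolute
+// differences in each half are accumulated into that half's low 16 bits.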
+
+inline __m128i DcSum8_SSE4_1(const void* const ref) {
+ const __m128i vals = LoadLo8(ref);
+ const __m128i zero = _mm_setzero_si128();
+ return _mm_sad_epu8(vals, zero);
+}
+
+inline __m128i DcSum16_SSE4_1(const void* const ref) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i vals = LoadUnaligned16(ref);
+ const __m128i partial_sum = _mm_sad_epu8(vals, zero);
+ return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+inline __m128i DcSum32_SSE4_1(const void* const ref) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i vals1 = LoadUnaligned16(ref);
+ const __m128i vals2 = LoadUnaligned16(static_cast<const uint8_t*>(ref) + 16);
+ const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
+ const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
+ const __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
+ return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+inline __m128i DcSum64_SSE4_1(const void* const ref) {
+ const auto* const ref_ptr = static_cast<const uint8_t*>(ref);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i vals1 = LoadUnaligned16(ref_ptr);
+ const __m128i vals2 = LoadUnaligned16(ref_ptr + 16);
+ const __m128i vals3 = LoadUnaligned16(ref_ptr + 32);
+ const __m128i vals4 = LoadUnaligned16(ref_ptr + 48);
+ const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
+ const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
+ __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
+ const __m128i partial_sum3 = _mm_sad_epu8(vals3, zero);
+ partial_sum = _mm_add_epi16(partial_sum, partial_sum3);
+ const __m128i partial_sum4 = _mm_sad_epu8(vals4, zero);
+ partial_sum = _mm_add_epi16(partial_sum, partial_sum4);
+ return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+template <int height>
+inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ Store4(dst, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ Store4(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore8xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreLo8(dst, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreLo8(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore16xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreUnaligned16(dst, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreUnaligned16(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore32xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreUnaligned16(dst, dc_dup);
+ StoreUnaligned16(dst + 16, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreUnaligned16(dst, dc_dup);
+ StoreUnaligned16(dst + 16, dc_dup);
+}
+
+template <int height>
+inline void DcStore64xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreUnaligned16(dst, dc_dup);
+ StoreUnaligned16(dst + 16, dc_dup);
+ StoreUnaligned16(dst + 32, dc_dup);
+ StoreUnaligned16(dst + 48, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreUnaligned16(dst, dc_dup);
+ StoreUnaligned16(dst + 16, dc_dup);
+ StoreUnaligned16(dst + 32, dc_dup);
+ StoreUnaligned16(dst + 48, dc_dup);
+}
+
+// WriteDuplicateN assumes |dup32| has 4 sets of 4 identical bytes; each set
+// is broadcast across one row of width N in dest.
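+// For example, if |dup32| holds {a,a,a,a, b,b,b,b, c,c,c,c, d,d,d,d},
+// WriteDuplicate8x4 writes four rows of a, b, c and d respectively, each 8
+// pixels wide and |stride| apart.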
+inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ Store4(dst, dup32);
+ dst += stride;
+ const int row1 = _mm_extract_epi32(dup32, 1);
+ memcpy(dst, &row1, 4);
+ dst += stride;
+ const int row2 = _mm_extract_epi32(dup32, 2);
+ memcpy(dst, &row2, 4);
+ dst += stride;
+ const int row3 = _mm_extract_epi32(dup32, 3);
+ memcpy(dst, &row3, 4);
+}
+
+inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+ auto* dst = static_cast<uint8_t*>(dest);
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_lo);
+ dst += stride;
+ _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_lo));
+ dst += stride;
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_hi);
+ dst += stride;
+ _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_hi));
+}
+
+inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+}
+
+inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+}
+
+inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_3);
+}
+
+// ColStoreN<height> copies each of the |height| values in |column| across its
+// corresponding row in dest.
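+// For example, DirDefs::_4x8 below pairs ColStore8_SSE4_1 with
+// WriteDuplicate4x4: the 8 bytes of |column| become two registers of 4-byte
+// duplicates, written out as 8 rows of 4 identical pixels.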
+template <WriteDuplicateFunc writefn>
+inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const __m128i col_data = Load4(column);
+ const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup32 = _mm_unpacklo_epi16(col_dup16, col_dup16);
+ writefn(dest, stride, col_dup32);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ const __m128i col_data = LoadLo8(column);
+ const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_dup16, col_dup16);
+ auto* dst = static_cast<uint8_t*>(dest);
+ writefn(dst, stride, col_dup32_lo);
+ dst += stride4;
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_dup16, col_dup16);
+ writefn(dst, stride, col_dup32_hi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ const __m128i col_data = _mm_loadu_si128(static_cast<const __m128i*>(column));
+ const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+ const __m128i col_dup32_lolo = _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+ auto* dst = static_cast<uint8_t*>(dest);
+ writefn(dst, stride, col_dup32_lolo);
+ dst += stride4;
+ const __m128i col_dup32_lohi = _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lohi);
+ dst += stride4;
+ const __m128i col_dup32_hilo = _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hilo);
+ dst += stride4;
+ const __m128i col_dup32_hihi = _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hihi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 32; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+ const __m128i col_dup32_lolo =
+ _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lolo);
+ dst += stride4;
+ const __m128i col_dup32_lohi =
+ _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lohi);
+ dst += stride4;
+ const __m128i col_dup32_hilo =
+ _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hilo);
+ dst += stride4;
+ const __m128i col_dup32_hihi =
+ _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hihi);
+ dst += stride4;
+ }
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore64_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 64; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+ const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+ const __m128i col_dup32_lolo =
+ _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lolo);
+ dst += stride4;
+ const __m128i col_dup32_lohi =
+ _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+ writefn(dst, stride, col_dup32_lohi);
+ dst += stride4;
+ const __m128i col_dup32_hilo =
+ _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hilo);
+ dst += stride4;
+ const __m128i col_dup32_hihi =
+ _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+ writefn(dst, stride, col_dup32_hihi);
+ dst += stride4;
+ }
+}
+
+struct DcDefs {
+ DcDefs() = delete;
+
+ using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
+ DcStore4xH_SSE4_1<4>, 0, 0>;
+  // shiftk is the smaller of width_log2 and height_log2.
+  // dc_mult is the fixed-point reciprocal selected by the ratio of the
+  // shorter block dimension to the longer: 1:2 uses kThreeInverse, 1:4 uses
+  // kFiveInverse.
+ using _4x8 = DcPredFuncs_SSE4_1<2, 3, DcSum4_SSE4_1, DcSum8_SSE4_1,
+ DcStore4xH_SSE4_1<8>, 2, kThreeInverse>;
+ using _4x16 = DcPredFuncs_SSE4_1<2, 4, DcSum4_SSE4_1, DcSum16_SSE4_1,
+ DcStore4xH_SSE4_1<16>, 2, kFiveInverse>;
+
+ using _8x4 = DcPredFuncs_SSE4_1<3, 2, DcSum8_SSE4_1, DcSum4_SSE4_1,
+ DcStore8xH_SSE4_1<4>, 2, kThreeInverse>;
+ using _8x8 = DcPredFuncs_SSE4_1<3, 3, DcSum8_SSE4_1, DcSum8_SSE4_1,
+ DcStore8xH_SSE4_1<8>, 0, 0>;
+ using _8x16 = DcPredFuncs_SSE4_1<3, 4, DcSum8_SSE4_1, DcSum16_SSE4_1,
+ DcStore8xH_SSE4_1<16>, 3, kThreeInverse>;
+ using _8x32 = DcPredFuncs_SSE4_1<3, 5, DcSum8_SSE4_1, DcSum32_SSE4_1,
+ DcStore8xH_SSE4_1<32>, 3, kFiveInverse>;
+
+ using _16x4 = DcPredFuncs_SSE4_1<4, 2, DcSum16_SSE4_1, DcSum4_SSE4_1,
+ DcStore16xH_SSE4_1<4>, 2, kFiveInverse>;
+ using _16x8 = DcPredFuncs_SSE4_1<4, 3, DcSum16_SSE4_1, DcSum8_SSE4_1,
+ DcStore16xH_SSE4_1<8>, 3, kThreeInverse>;
+ using _16x16 = DcPredFuncs_SSE4_1<4, 4, DcSum16_SSE4_1, DcSum16_SSE4_1,
+ DcStore16xH_SSE4_1<16>, 0, 0>;
+ using _16x32 = DcPredFuncs_SSE4_1<4, 5, DcSum16_SSE4_1, DcSum32_SSE4_1,
+ DcStore16xH_SSE4_1<32>, 4, kThreeInverse>;
+ using _16x64 = DcPredFuncs_SSE4_1<4, 6, DcSum16_SSE4_1, DcSum64_SSE4_1,
+ DcStore16xH_SSE4_1<64>, 4, kFiveInverse>;
+
+ using _32x8 = DcPredFuncs_SSE4_1<5, 3, DcSum32_SSE4_1, DcSum8_SSE4_1,
+ DcStore32xH_SSE4_1<8>, 3, kFiveInverse>;
+ using _32x16 = DcPredFuncs_SSE4_1<5, 4, DcSum32_SSE4_1, DcSum16_SSE4_1,
+ DcStore32xH_SSE4_1<16>, 4, kThreeInverse>;
+ using _32x32 = DcPredFuncs_SSE4_1<5, 5, DcSum32_SSE4_1, DcSum32_SSE4_1,
+ DcStore32xH_SSE4_1<32>, 0, 0>;
+ using _32x64 = DcPredFuncs_SSE4_1<5, 6, DcSum32_SSE4_1, DcSum64_SSE4_1,
+ DcStore32xH_SSE4_1<64>, 5, kThreeInverse>;
+
+ using _64x16 = DcPredFuncs_SSE4_1<6, 4, DcSum64_SSE4_1, DcSum16_SSE4_1,
+ DcStore64xH_SSE4_1<16>, 4, kFiveInverse>;
+ using _64x32 = DcPredFuncs_SSE4_1<6, 5, DcSum64_SSE4_1, DcSum32_SSE4_1,
+ DcStore64xH_SSE4_1<32>, 5, kThreeInverse>;
+ using _64x64 = DcPredFuncs_SSE4_1<6, 6, DcSum64_SSE4_1, DcSum64_SSE4_1,
+ DcStore64xH_SSE4_1<64>, 0, 0>;
+};
+
+struct DirDefs {
+ DirDefs() = delete;
+
+ using _4x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
+ using _4x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
+ using _4x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
+ using _8x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
+ using _8x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
+ using _8x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
+ using _8x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
+ using _16x4 =
+ DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
+ using _16x8 =
+ DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
+ using _16x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
+ using _16x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
+ using _16x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
+ using _32x8 =
+ DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
+ using _32x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
+ using _32x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
+ using _32x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
+ using _64x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
+ using _64x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
+ using _64x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
+};
+
+template <int y_mask>
+inline void WritePaethLine4(uint8_t* dst, const __m128i& top,
+ const __m128i& left, const __m128i& top_lefts,
+ const __m128i& top_dists, const __m128i& left_dists,
+ const __m128i& top_left_diffs) {
+ const __m128i top_dists_y = _mm_shuffle_epi32(top_dists, y_mask);
+
+ const __m128i lefts_y = _mm_shuffle_epi32(left, y_mask);
+ const __m128i top_left_dists =
+ _mm_abs_epi32(_mm_add_epi32(lefts_y, top_left_diffs));
+
+ // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+ // operation is unavailable, so the logic for selecting top, left, or
+ // top_left is inverted.
+ __m128i not_select_left = _mm_cmpgt_epi32(left_dists, top_left_dists);
+ not_select_left =
+ _mm_or_si128(not_select_left, _mm_cmpgt_epi32(left_dists, top_dists_y));
+ const __m128i not_select_top = _mm_cmpgt_epi32(top_dists_y, top_left_dists);
+
+ const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);
+
+ const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+ __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+ top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+ top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);
+
+ // The sequence of 32-bit packed operations was found (see CL via blame) to
+ // outperform 16-bit operations, despite the availability of the packus
+ // function, when tested on a Xeon E7 v3.
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ const __m128i pred = _mm_shuffle_epi8(
+ _mm_or_si128(left_out, top_or_top_left_out), cvtepi32_epi8);
+ Store4(dst, pred);
+}
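+
+// For reference, the scalar selection (Section 7.11.2.2, using the spec's
+// pLeft/pTop/pTopLeft terms) that the masked blends above implement is, in
+// sketch form:
+//   base = top[x] + left[y] - top[-1];
+//   p_left = abs(base - left[y]);
+//   p_top = abs(base - top[x]);
+//   p_top_left = abs(base - top[-1]);
+//   pred = (p_left <= p_top && p_left <= p_top_left) ? left[y]
+//        : (p_top <= p_top_left) ? top[x] : top[-1];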
+
+// top_left_diffs is the only variable whose values may exceed 8 bits;
+// otherwise all of these operations could be done as epi8 for a 16-pixel
+// version of this function. Still, since lefts_y is just a vector of
+// duplicates, it could pay off to widen to epi16 only for the cmpgt against
+// top_left_dists and repack into epi8 for the blends.
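+// (top_left_diffs = top - 2 * top_left, so its values lie in [-510, 255].)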
+template <int y_mask>
+inline void WritePaethLine8(uint8_t* dst, const __m128i& top,
+ const __m128i& left, const __m128i& top_lefts,
+ const __m128i& top_dists, const __m128i& left_dists,
+ const __m128i& top_left_diffs) {
+ const __m128i select_y = _mm_set1_epi32(y_mask);
+ const __m128i top_dists_y = _mm_shuffle_epi8(top_dists, select_y);
+
+ const __m128i lefts_y = _mm_shuffle_epi8(left, select_y);
+ const __m128i top_left_dists =
+ _mm_abs_epi16(_mm_add_epi16(lefts_y, top_left_diffs));
+
+ // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+ // operation is unavailable, so the logic for selecting top, left, or
+ // top_left is inverted.
+ __m128i not_select_left = _mm_cmpgt_epi16(left_dists, top_left_dists);
+ not_select_left =
+ _mm_or_si128(not_select_left, _mm_cmpgt_epi16(left_dists, top_dists_y));
+ const __m128i not_select_top = _mm_cmpgt_epi16(top_dists_y, top_left_dists);
+
+ const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);
+
+ const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+ __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+ top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+ top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);
+
+ const __m128i pred = _mm_packus_epi16(
+ _mm_or_si128(left_out, top_or_top_left_out), /* unused */ left_out);
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), pred);
+}
+
+// |top| is an epi8 of length 16
+// |left| is epi8 of unknown length, as y_mask specifies access
+// |top_lefts| is an epi8 of 16 duplicates
+// |top_dists| is an epi8 of unknown length, as y_mask specifies access
+// |left_dists| is an epi8 of length 16
+// |left_dists_lo| is an epi16 of length 8
+// |left_dists_hi| is an epi16 of length 8
+// |top_left_diffs_lo| is an epi16 of length 8
+// |top_left_diffs_hi| is an epi16 of length 8
+// The latter two vectors are epi16 because their values may reach -510.
+// |left_dists| is provided alongside its spread out version because it doesn't
+// change between calls and interacts with both kinds of packing.
+template <int y_mask>
+inline void WritePaethLine16(uint8_t* dst, const __m128i& top,
+ const __m128i& left, const __m128i& top_lefts,
+ const __m128i& top_dists,
+ const __m128i& left_dists,
+ const __m128i& left_dists_lo,
+ const __m128i& left_dists_hi,
+ const __m128i& top_left_diffs_lo,
+ const __m128i& top_left_diffs_hi) {
+ const __m128i select_y = _mm_set1_epi32(y_mask);
+ const __m128i top_dists_y8 = _mm_shuffle_epi8(top_dists, select_y);
+ const __m128i top_dists_y16 = _mm_cvtepu8_epi16(top_dists_y8);
+ const __m128i lefts_y8 = _mm_shuffle_epi8(left, select_y);
+ const __m128i lefts_y16 = _mm_cvtepu8_epi16(lefts_y8);
+
+ const __m128i top_left_dists_lo =
+ _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_lo));
+ const __m128i top_left_dists_hi =
+ _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_hi));
+
+ const __m128i left_gt_top_left_lo = _mm_packs_epi16(
+ _mm_cmpgt_epi16(left_dists_lo, top_left_dists_lo), left_dists_lo);
+ const __m128i left_gt_top_left_hi =
+ _mm_packs_epi16(_mm_cmpgt_epi16(left_dists_hi, top_left_dists_hi),
+ /* unused second arg for pack */ left_dists_hi);
+ const __m128i left_gt_top_left = _mm_alignr_epi8(
+ left_gt_top_left_hi, _mm_slli_si128(left_gt_top_left_lo, 8), 8);
+
+ const __m128i not_select_top_lo =
+ _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_lo),
+ /* unused second arg for pack */ top_dists_y16);
+ const __m128i not_select_top_hi =
+ _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_hi),
+ /* unused second arg for pack */ top_dists_y16);
+ const __m128i not_select_top = _mm_alignr_epi8(
+ not_select_top_hi, _mm_slli_si128(not_select_top_lo, 8), 8);
+
+ const __m128i left_leq_top =
+ _mm_cmpeq_epi8(left_dists, _mm_min_epu8(top_dists_y8, left_dists));
+ const __m128i select_left = _mm_andnot_si128(left_gt_top_left, left_leq_top);
+
+ // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+ // operation is unavailable, so the logic for selecting top, left, or
+ // top_left is inverted.
+ const __m128i left_out = _mm_and_si128(select_left, lefts_y8);
+
+ const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+ __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+ top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+ top_or_top_left_out = _mm_andnot_si128(select_left, top_or_top_left_out);
+ const __m128i pred = _mm_or_si128(left_out, top_or_top_left_out);
+
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), pred);
+}
+
+void Paeth4x4_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row, const void* const left_column) {
+ const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+ const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
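+  // (e.g. base - left[y] = (top[x] + left[y] - top[-1]) - left[y]
+  //                      = top[x] - top[-1])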
+ const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
+ const __m128i top_dists = _mm_abs_epi32(_mm_sub_epi32(left, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine4<0>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+}
+
+void Paeth4x8_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row, const void* const left_column) {
+ const __m128i left = LoadLo8(left_column);
+ const __m128i left_lo = _mm_cvtepu8_epi32(left);
+ const __m128i left_hi = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
+
+ const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
+ const __m128i top_dists_lo = _mm_abs_epi32(_mm_sub_epi32(left_lo, top_lefts));
+ const __m128i top_dists_hi = _mm_abs_epi32(_mm_sub_epi32(left_hi, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine4<0>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+ top_left_diff);
+}
+
+void Paeth4x16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const __m128i left_0 = _mm_cvtepu8_epi32(left);
+ const __m128i left_1 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
+ const __m128i left_2 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 8));
+ const __m128i left_3 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 12));
+
+ const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
+ const __m128i top_dists_0 = _mm_abs_epi32(_mm_sub_epi32(left_0, top_lefts));
+ const __m128i top_dists_1 = _mm_abs_epi32(_mm_sub_epi32(left_1, top_lefts));
+ const __m128i top_dists_2 = _mm_abs_epi32(_mm_sub_epi32(left_2, top_lefts));
+ const __m128i top_dists_3 = _mm_abs_epi32(_mm_sub_epi32(left_3, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine4<0>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0x55>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xAA>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine4<0xFF>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
+ top_left_diff);
+}
+
+void Paeth8x4_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row, const void* const left_column) {
+ const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
+ const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+}
+
+void Paeth8x8_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row, const void* const left_column) {
+ const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
+ const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x09080908>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0B0A0B0A>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0D0C0D0C>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0F0E0F0E>(dst, top, left, top_lefts, top_dists, left_dists,
+ top_left_diff);
+}
+
+void Paeth8x16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const __m128i left_lo = _mm_cvtepu8_epi16(left);
+ const __m128i left_hi = _mm_cvtepu8_epi16(_mm_srli_si128(left, 8));
+ const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+ const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
+ const __m128i top_dists_lo = _mm_abs_epi16(_mm_sub_epi16(left_lo, top_lefts));
+ const __m128i top_dists_hi = _mm_abs_epi16(_mm_sub_epi16(left_hi, top_lefts));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
+ const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine8<0x01000100>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x03020302>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x05040504>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x07060706>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x09080908>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0B0A0B0A>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0D0C0D0C>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0F0E0F0E>(dst, top, left_lo, top_lefts, top_dists_lo,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x01000100>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x03020302>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x05040504>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x07060706>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x09080908>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0B0A0B0A>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0D0C0D0C>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+ dst += stride;
+ WritePaethLine8<0x0F0E0F0E>(dst, top, left_hi, top_lefts, top_dists_hi,
+ left_dists, top_left_diff);
+}
+
+void Paeth8x32_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* const dst = static_cast<uint8_t*>(dest);
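+  // 8x32 is two stacked 8x16 blocks: stride << 4 skips the first 16 rows and
+  // the second call uses the next 16 left-column pixels with the same top row.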
+ Paeth8x16_SSE4_1(dst, stride, top_row, left_column);
+ Paeth8x16_SSE4_1(dst + (stride << 4), stride, top_row, left_ptr + 16);
+}
+
+void Paeth16x4_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = Load4(left_column);
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_lefts16 = _mm_set1_epi16(top_ptr[-1]);
+ const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_ptr[-1]));
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+
+ const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
+ _mm_subs_epu8(top_lefts8, top));
+ const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
+ const __m128i left_dists_hi =
+ _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
+ const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
+ _mm_subs_epu8(top_lefts8, left));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
+ const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
+ const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
+ left_dists_lo, left_dists_hi, top_left_diff_lo,
+ top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+}
+
+// Inlined for calling with offsets in larger transform sizes, mainly to
+// preserve top_left.
+inline void WritePaeth16x8(void* const dest, ptrdiff_t stride,
+ const uint8_t top_left, const __m128i top,
+ const __m128i left) {
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+ const __m128i top_lefts16 = _mm_set1_epi16(top_left);
+ const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_left));
+
+ // Given that the spec defines "base" as top[x] + left[y] - top_left,
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+
+ const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
+ _mm_subs_epu8(top_lefts8, top));
+ const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
+ const __m128i left_dists_hi =
+ _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
+ const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
+ _mm_subs_epu8(top_lefts8, left));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
+ const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
+ const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
+ left_dists_lo, left_dists_hi, top_left_diff_lo,
+ top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+}
+
+void Paeth16x8_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i left = LoadLo8(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ WritePaeth16x8(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
+}
+
+void WritePaeth16x16(void* const dest, ptrdiff_t stride, const uint8_t top_left,
+ const __m128i top, const __m128i left) {
+ const __m128i top_lo = _mm_cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+ const __m128i top_lefts16 = _mm_set1_epi16(top_left);
+ const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_left));
+
+ // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+ // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+ // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+
+ const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
+ _mm_subs_epu8(top_lefts8, top));
+ const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
+ const __m128i left_dists_hi =
+ _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
+ const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
+ _mm_subs_epu8(top_lefts8, left));
+
+ const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
+ const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
+ const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
+ left_dists_lo, left_dists_hi, top_left_diff_lo,
+ top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x08080808>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x09090909>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0A0A0A0A>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0B0B0B0B>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0C0C0C0C>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0D0D0D0D>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0E0E0E0E>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+ dst += stride;
+ WritePaethLine16<0x0F0F0F0F>(dst, top, left, top_lefts8, top_dists,
+ left_dists, left_dists_lo, left_dists_hi,
+ top_left_diff_lo, top_left_diff_hi);
+}
+
+void Paeth16x16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const __m128i top = LoadUnaligned16(top_row);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ WritePaeth16x16(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
+}
+
+void Paeth16x32_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left_0 = LoadUnaligned16(left_column);
+ const __m128i top = LoadUnaligned16(top_row);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const uint8_t top_left = top_ptr[-1];
+ auto* const dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top, left_0);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ WritePaeth16x16(dst + (stride << 4), stride, top_left, top, left_1);
+}
+
+void Paeth16x64_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const ptrdiff_t stride16 = stride << 4;
+ const __m128i left_0 = LoadUnaligned16(left_column);
+ const __m128i top = LoadUnaligned16(top_row);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top, left_0);
+ dst += stride16;
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ WritePaeth16x16(dst, stride, top_left, top, left_1);
+ dst += stride16;
+ const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
+ WritePaeth16x16(dst, stride, top_left, top, left_2);
+ dst += stride16;
+ const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
+ WritePaeth16x16(dst, stride, top_left, top, left_3);
+}
+
+void Paeth32x8_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = LoadLo8(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_row);
+ const uint8_t top_left = top_ptr[-1];
+ auto* const dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x8(dst, stride, top_left, top_0, left);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ WritePaeth16x8(dst + 16, stride, top_left, top_1, left);
+}
+
+void Paeth32x16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_row);
+ const uint8_t top_left = top_ptr[-1];
+ auto* const dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
+}
+
+void Paeth32x32_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_0 = LoadUnaligned16(left_ptr);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+}
+
+void Paeth32x64_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_0 = LoadUnaligned16(left_ptr);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
+ const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_2);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_3);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
+}
+
+void Paeth64x16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const __m128i left = LoadUnaligned16(left_column);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left);
+}
+
+void Paeth64x32_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_0 = LoadUnaligned16(left_ptr);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
+}
+
+void Paeth64x64_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ const __m128i left_0 = LoadUnaligned16(left_ptr);
+ const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+ const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
+ const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const __m128i top_0 = LoadUnaligned16(top_ptr);
+ const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+ const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
+ const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
+ const uint8_t top_left = top_ptr[-1];
+ auto* dst = static_cast<uint8_t*>(dest);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_2);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_2);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_2);
+ dst += (stride << 4);
+ WritePaeth16x16(dst, stride, top_left, top_0, left_3);
+ WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
+ WritePaeth16x16(dst + 32, stride, top_left, top_2, left_3);
+ WritePaeth16x16(dst + 48, stride, top_left, top_3, left_3);
+}
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
+// upsampling is ruled out. In addition, the bits masked by 0x3F for
+// |shift_val| are 0 for all multiples of 64, so the formula
+// val = top[top_base_x]*(32-shift) + top[top_base_x+1]*shift reduces to
+// val = top[top_base_x] << 5. Because |top_x| starts at |xstep|, row y reads
+// from top_base_x = y + 1, so |top| is simply offset by 1.
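+// For example, in a non-upsampled 8-wide block, row 0 is a copy of top[1..8],
+// row 1 of top[2..9], and so on.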
+inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride,
+ const uint8_t* const top, const int width,
+ const int height) {
+ ptrdiff_t offset = 1;
+ if (height == 4) {
+ memcpy(dst, top + offset, width);
+ dst += stride;
+ memcpy(dst, top + offset + 1, width);
+ dst += stride;
+ memcpy(dst, top + offset + 2, width);
+ dst += stride;
+ memcpy(dst, top + offset + 3, width);
+ return;
+ }
+ int y = 0;
+ do {
+ memcpy(dst, top + offset, width);
+ dst += stride;
+ memcpy(dst, top + offset + 1, width);
+ dst += stride;
+ memcpy(dst, top + offset + 2, width);
+ dst += stride;
+ memcpy(dst, top + offset + 3, width);
+ dst += stride;
+ memcpy(dst, top + offset + 4, width);
+ dst += stride;
+ memcpy(dst, top + offset + 5, width);
+ dst += stride;
+ memcpy(dst, top + offset + 6, width);
+ dst += stride;
+ memcpy(dst, top + offset + 7, width);
+ dst += stride;
+
+ offset += 8;
+ y += 8;
+ } while (y < height);
+}
+
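+// For reference, a scalar sketch (not compiled) of the interpolation the SIMD
+// below performs for each output pixel via _mm_maddubs_epi16 and
+// RightShiftWithRounding_U16:
+//   top_base_x = top_x >> (6 - upsample_shift);
+//   shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+//   pixel = (top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift +
+//            16) >> 5;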
+inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride,
+ const uint8_t* const top, const int height,
+ const int xstep, const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const int rounding_bits = 5;
+ const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
+ const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
+ const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100)
+ : _mm_set_epi64x(0, 0x0403030202010100);
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
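+  // For example, if top_base_x were 28 and max_base_x 30, lanes 2 and 3 of
+  // |top_index_vect| would hold 31 and 32, both greater than 30, so those two
+  // output pixels would be replaced with the corner value.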
+
+  // All rows from |min_corner_only_y| down are filled with the corner pixel
+  // via memset. |max_base_x| is always greater than |height|, so clipping
+  // |xstep_units| to a minimum of 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
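+  // As an arithmetic illustration: height 16 with xstep 256 gives
+  // max_base_x = 19 and xstep_units = 4, so min_corner_only_y = 4 and rows 4
+  // and beyond consist entirely of the corner pixel.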
+
+  // Rows before |min_corner_only_y| are computed here; lanes that pass
+  // |max_base_x| are masked to the corner pixel below.
+ int y = 0;
+ int top_x = xstep;
+
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ const int top_base_x = top_x >> scale_bits;
+
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+
+ // Load 8 values because we will select the sampled values based on
+ // |upsampled|.
+ const __m128i values = LoadLo8(top + top_base_x);
+ const __m128i sampled_values = _mm_shuffle_epi8(values, sampler);
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ __m128i prod = _mm_maddubs_epi16(sampled_values, shifts);
+ prod = RightShiftWithRounding_U16(prod, rounding_bits);
+ // Replace pixels from invalid range with top-right corner.
+ prod = _mm_blendv_epi8(prod, final_top_val, past_max);
+ Store4(dst, _mm_packus_epi16(prod, prod));
+ }
+
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ memset(dst, top[max_base_x], /* width */ 4);
+ dst += stride;
+ }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const int width, const int height,
+ const int xstep, const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const __m128i sampler =
+ upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ const int scale_bits = 6 - upsample_shift;
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const int rounding_bits = 5;
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+
+  // All rows from |min_corner_only_y| down are filled with the corner pixel
+  // via memset. |max_base_x| is always greater than |height|, so clipping
+  // |xstep_units| to a minimum of 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ // Rows up to this y-value can be computed without checking for bounds.
+ const int max_no_corner_y = std::min(
+ LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep,
+ height);
+ // No need to check for exceeding |max_base_x| in the first loop.
+ int y = 0;
+ int top_x = xstep;
+ for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> scale_bits;
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ int x = 0;
+ do {
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+ const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+ for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> scale_bits;
+
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ int x = 0;
+ const int min_corner_only_x =
+ std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
+ for (; x < min_corner_only_x;
+ x += 8, top_base_x += base_step8,
+ top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
+ // reading out of bounds. If all indices are past max and we don't need to
+ // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
+ // reset for the next |y|.
+ top_base_x &= ~_mm_cvtsi128_si32(past_max);
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ }
+ // Corner-only section of the row.
+ memset(dest + x, top_row[max_base_x], width - x);
+ }
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ memset(dest, top_row[max_base_x], width);
+ dest += stride;
+ }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const int width, const int height,
+ const int xstep, const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ if (xstep == 64) {
+ DirectionalZone1_Step64(dest, stride, top_row, width, height);
+ return;
+ }
+ if (width == 4) {
+ DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled);
+ return;
+ }
+ if (width >= 32) {
+ DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
+ upsampled);
+ return;
+ }
+ const __m128i sampler =
+ upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ const int scale_bits = 6 - upsample_shift;
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const int rounding_bits = 5;
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+
+ // No need to check for exceeding |max_base_x| in the loops.
+ if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) {
+ int top_x = xstep;
+ int y = 0;
+ do {
+ int top_base_x = top_x >> scale_bits;
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ int x = 0;
+ do {
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ dest += stride;
+ top_x += xstep;
+ } while (++y < height);
+ return;
+ }
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+ const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+ int top_x = xstep;
+ int y = 0;
+ do {
+ int top_base_x = top_x >> scale_bits;
+
+ if (top_base_x >= max_base_x) {
+ for (int i = y; i < height; ++i) {
+ memset(dest, top_row[max_base_x], width);
+ dest += stride;
+ }
+ return;
+ }
+
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ int x = 0;
+ for (; x < width - 8;
+ x += 8, top_base_x += base_step8,
+ top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
+ // reading out of bounds. If all indices are past max and we don't need to
+ // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
+ // reset for the next |y|.
+ top_base_x &= ~_mm_cvtsi128_si32(past_max);
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ }
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ __m128i vals;
+ if (upsampled) {
+ vals = LoadUnaligned16(top_row + top_base_x);
+ } else {
+ const __m128i top_vals = LoadLo8(top_row + top_base_x);
+ vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15);
+ }
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ dest += stride;
+ top_x += xstep;
+ } while (++y < height);
+}
+
+void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const int width, const int height,
+ const int xstep,
+ const bool upsampled_top) {
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep,
+ upsampled_top);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const left_column,
+ const int base_left_y, const int ystep) {
+ // For use in the non-upsampled case.
+ const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const int rounding_bits = 5;
+
+ __m128i result_block[4];
+ for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) {
+ const int left_base_y = left_y >> scale_bits;
+ const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i vals;
+ if (upsampled) {
+ vals = LoadLo8(left_column + left_base_y);
+ } else {
+ const __m128i top_vals = LoadLo8(left_column + left_base_y);
+ vals = _mm_shuffle_epi8(top_vals, sampler);
+ }
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ result_block[x] = _mm_packus_epi16(vals, vals);
+ }
+ const __m128i result = Transpose4x4_U8(result_block);
+ // This is result_row0.
+ Store4(dest, result);
+ dest += stride;
+ const int result_row1 = _mm_extract_epi32(result, 1);
+ memcpy(dest, &result_row1, sizeof(result_row1));
+ dest += stride;
+ const int result_row2 = _mm_extract_epi32(result, 2);
+ memcpy(dest, &result_row2, sizeof(result_row2));
+ dest += stride;
+ const int result_row3 = _mm_extract_epi32(result, 3);
+ memcpy(dest, &result_row3, sizeof(result_row3));
+}
+
+template <bool upsampled, int height>
+inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const left_column,
+ const int base_left_y, const int ystep) {
+ // For use in the non-upsampled case.
+ const __m128i sampler =
+ _mm_set_epi64x(0x0807070606050504, 0x0403030202010100);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const int rounding_bits = 5;
+
+ __m128i result_block[8];
+ for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) {
+ const int left_base_y = left_y >> scale_bits;
+ const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i vals;
+ if (upsampled) {
+ vals = LoadUnaligned16(left_column + left_base_y);
+ } else {
+ const __m128i top_vals = LoadUnaligned16(left_column + left_base_y);
+ vals = _mm_shuffle_epi8(top_vals, sampler);
+ }
+ vals = _mm_maddubs_epi16(vals, shifts);
+ result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits);
+ }
+ Transpose8x8_U16(result_block, result_block);
+ for (int y = 0; y < height; ++y) {
+ StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y]));
+ dest += stride;
+ }
+}
+
+// 7.11.2.4 (9) angle > 180
+void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride,
+ const void* const left_column,
+ const int width, const int height,
+ const int ystep,
+ const bool upsampled) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int upsample_shift = static_cast<int>(upsampled);
+ if (width == 4 || height == 4) {
+ const ptrdiff_t stride4 = stride << 2;
+ if (upsampled) {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_4x4<true>(
+ dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+ dst_x += stride4;
+ y += 4;
+ } while (y < height);
+ left_y += ystep << 2;
+ x += 4;
+ } while (x < width);
+ } else {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y,
+ ystep);
+ dst_x += stride4;
+ y += 4;
+ } while (y < height);
+ left_y += ystep << 2;
+ x += 4;
+ } while (x < width);
+ }
+ return;
+ }
+
+ const ptrdiff_t stride8 = stride << 3;
+ if (upsampled) {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_8xH<true, 8>(
+ dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+ dst_x += stride8;
+ y += 8;
+ } while (y < height);
+ left_y += ystep << 3;
+ x += 8;
+ } while (x < width);
+ } else {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_8xH<false, 8>(
+ dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+ dst_x += stride8;
+ y += 8;
+ } while (y < height);
+ left_y += ystep << 3;
+ x += 8;
+ } while (x < width);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Directional Zone 2 Functions
+// 7.11.2.4 (8)
+
+// DirectionalBlend* selectively overwrites the values written by
+// DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each
+// row.
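+// Within a row, pixels with x below the row's bound keep the value already
+// written from |left_column|; pixels at or beyond the bound take the freshly
+// computed top-row value in |vals|.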
+template <int y_selector>
+inline void DirectionalBlend4_SSE4_1(uint8_t* dest,
+ const __m128i& dest_index_vect,
+ const __m128i& vals,
+ const __m128i& zone_bounds) {
+ const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector);
+ const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+ const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest));
+ const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+ Store4(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+inline void DirectionalBlend8_SSE4_1(uint8_t* dest,
+ const __m128i& dest_index_vect,
+ const __m128i& vals,
+ const __m128i& zone_bounds,
+ const __m128i& bounds_selector) {
+ const __m128i max_dest_x_vect =
+ _mm_shuffle_epi8(zone_bounds, bounds_selector);
+ const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+ const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest));
+ const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+ StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+constexpr int kDirectionalWeightBits = 5;
+// |source| is packed with 4 or 8 pairs of 8-bit values from left or top.
+// |shifts| is named to match the specification, with 4 or 8 pairs of (32 -
+// shift) and shift. Shift is guaranteed to be between 0 and 32.
+inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source,
+ const __m128i& shifts,
+ const __m128i& sampler) {
+ const __m128i src_vals = LoadUnaligned16(source);
+ __m128i vals = _mm_shuffle_epi8(src_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ return RightShiftWithRounding_U16(vals, kDirectionalWeightBits);
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative. This is accommodated by
+// making sure the relative indices are within [-15, 0] when the function is
+// called, and sliding them into the inclusive range [0, 15], relative to a
+// lower base address.
+constexpr int kPositiveIndexOffset = 15;
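+// For example, a relative index of -3 becomes shuffle index -3 + 15 = 12 once
+// the load address has been lowered by kPositiveIndexOffset.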
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
+ uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base,
+ __m128i left_y) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shifts = _mm_set1_epi8(32);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ const __m128i index_increment = _mm_cvtsi32_si128(0x01010101);
+ const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset);
+  // |left_column| is offset down by 15 and the sampler indices are offset up
+  // by 15 so the shuffle indices are always nonnegative.
+ const uint8_t* left_column = left_column_base - kPositiveIndexOffset;
+ for (int y = 0; y < 4; dst += stride, ++y) {
+ __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+ offset_y = _mm_packs_epi16(offset_y, offset_y);
+
+ const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+ __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+ // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they
+ // can work as shuffle indices. Some values may be out of bounds, but their
+ // pred results will be masked over by top prediction.
+ sampler = _mm_add_epi8(sampler, positive_offset);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+ left_column + (y << upsample_shift), shifts, sampler);
+ Store4(dst, _mm_packus_epi16(vals, vals));
+ }
+}
+
+// The height at which a load of 16 bytes will not contain enough source pixels
+// from |left_column| to supply an accurate row when computing 8 pixels at a
+// time. The values are found by inspection. By coincidence, all angles that
+// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
+// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
+constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
+ 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
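+// For example, ystep values in [128, 191] index entry 2, whose value is 16:
+// at most the first 16 rows of a column group may use the shuffle-based left
+// prediction before falling back to the slower but safer method.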
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
+ uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
+ __m128i left_y) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shifts = _mm_set1_epi8(32);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ const __m128i index_increment = _mm_set1_epi8(1);
+ const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset);
+ for (int y = 0; y < 8; dst += stride, ++y) {
+ __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+ offset_y = _mm_packs_epi16(offset_y, offset_y);
+ const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+
+ // Offset the relative index because ystep is negative in Zone 2 and shuffle
+ // indices must be nonnegative.
+ __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+ sampler = _mm_add_epi8(sampler, denegation);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+
+ // The specification adds (y << 6) to left_y, which is subject to
+    // upsampling, but this puts sampler indices out of the 0-15 range. It is
+    // equivalent to offsetting the source address by (y << upsample_shift)
+    // instead.
+ const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+ left_column - kPositiveIndexOffset + (y << upsample_shift), shifts,
+ sampler);
+ StoreLo8(dst, _mm_packus_epi16(vals, vals));
+ }
+}
+
+// |zone_bounds| is an epi16 of the relative x index at which base >= -(1 <<
+// upsampled_top), for each row. When there are 4 values, they can be duplicated
+// with a non-register shuffle mask.
+// |shifts| is one pair of weights that applies throughout a given row.
+template <bool upsampled_top>
+inline void DirectionalZone1Blend_4x4(
+ uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+ __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+ const __m128i& dest_index_x, int top_x, const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled_top);
+ const int scale_bits_x = 6 - upsample_shift;
+ top_x -= xstep;
+
+ int top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals0 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler);
+ DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds);
+ top_x -= xstep;
+ dest += stride;
+
+ top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals1 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler);
+ DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds);
+ top_x -= xstep;
+ dest += stride;
+
+ top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals2 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler);
+ DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds);
+ top_x -= xstep;
+ dest += stride;
+
+ top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals3 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler);
+ DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds);
+}
+
+template <bool upsampled_top, int height>
+inline void DirectionalZone1Blend_8xH(
+ uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+ __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+ const __m128i& dest_index_x, int top_x, const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled_top);
+ const int scale_bits_x = 6 - upsample_shift;
+
+ __m128i y_selector = _mm_set1_epi32(0x01000100);
+ const __m128i index_increment = _mm_set1_epi32(0x02020202);
+ for (int y = 0; y < height; ++y,
+ y_selector = _mm_add_epi8(y_selector, index_increment),
+ dest += stride) {
+ top_x -= xstep;
+ const int top_base_x = top_x >> scale_bits_x;
+ const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler);
+ DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds, y_selector);
+ }
+}
+
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for this function is to know how many blocks can be processed
+// with just pixels from |top_ptr|, then handle mixed blocks, then handle only
+// blocks that take from |left_ptr|. Additionally, a fast index-shuffle
+// approach is used for pred values from |left_column| in sections that permit
+// it.
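+// For each group of columns, rows with y < |max_top_only_y| use only the top
+// row, rows up to |min_left_only_y| blend both sources, and the remaining
+// rows use only the left column.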
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const uint8_t* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const ptrdiff_t stride8 = stride << 3;
+ const __m128i dest_index_x =
+ _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
+ const __m128i sampler_top =
+ upsampled_top
+ ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute. This assumes minimum |xstep| is 3.
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
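+  // For example (illustrative values), height 16 with xstep 128 gives
+  // min_top_only_x = 32, so any columns from 32 on never reference
+  // |left_column|.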
+
+ // For steep angles, the source pixels from left_column may not fit in a
+ // 16-byte load for shuffling.
+ // TODO(petersonab): Find a more precise formula for this subject to x.
+ const int max_shuffle_height =
+ std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]);
+
+ const int xstep8 = xstep << 3;
+ const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
+ // Accumulate xstep across 8 rows.
+ const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+ const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+ // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
+ const __m128i scaled_one = _mm_set1_epi16(-64);
+ __m128i xstep_bounds_base =
+ (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+ : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+ const int ystep8 = ystep << 3;
+ const int left_base_increment8 = ystep8 >> 6;
+ const int ystep_remainder8 = ystep8 & 0x3F;
+ const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8);
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which is covered under the left_column
+ // offset. Following values need the full ystep as a relative offset.
+ const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+ const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+ __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+ left_y = _mm_add_epi16(ystep_init, left_y);
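+  // For instance, if ystep were 100, ystep_remainder would be 36 and lane k
+  // of |left_y| would start at -(36 + 100 * k).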
+
+ const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
+ int x = 0;
+
+ // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
+ for (int left_offset = -left_base_increment; x < min_top_only_x;
+ x += 8,
+ xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
+ // Watch left_y because it can still get big.
+ left_y = _mm_add_epi16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ uint8_t* dst_x = dst + x;
+
+ // Round down to the nearest multiple of 8.
+ const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+ DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ DirectionalZone1_4xH(dst_x + 4, stride,
+ top_row + ((x + 4) << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+ const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+    // All rows from |min_left_only_y| down for this set of columns only need
+    // |left_column| to compute.
+ const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
+ // At high angles such that min_left_only_y < 8, ystep is low and xstep is
+ // high. This means that max_shuffle_height is unbounded and xstep_bounds
+ // will overflow in 16 bits. This is prevented by stopping the first
+ // blending loop at min_left_only_y for such cases, which means we skip over
+ // the second blending loop as well.
+ const int left_shuffle_stop_y =
+ std::min(max_shuffle_height, min_left_only_y);
+ __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+ __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+ int top_x = -xstep_y;
+
+ for (; y < left_shuffle_stop_y;
+ y += 8, dst_x += stride8,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+ top_x -= xstep8) {
+ DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+ DirectionalZone1Blend_8xH<upsampled_top, 8>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Pick up from the last y-value, using the 10% slower but secure method for
+ // left prediction.
+ const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+ top_x -= xstep8) {
+ const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+
+ DirectionalZone3_8xH<upsampled_left, 8>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ DirectionalZone1Blend_8xH<upsampled_top, 8>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_8xH<upsampled_left, 8>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep);
+ }
+ }
+ for (; x < width; x += 4) {
+ DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+ height, -xstep, upsampled_top);
+ }
+}
+
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const uint8_t* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const ptrdiff_t stride4 = stride << 2;
+ const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000);
+ const __m128i sampler_top =
+ upsampled_top
+ ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute.
+ assert(xstep >= 3);
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+ const int xstep4 = xstep << 2;
+ const __m128i xstep4_vect = _mm_set1_epi16(xstep4);
+ const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+ const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001);
+ __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+ const __m128i scaled_one = _mm_set1_epi16(-64);
+ // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
+ __m128i xstep_bounds_base =
+ (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+ : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+ const int ystep4 = ystep << 2;
+ const int left_base_increment4 = ystep4 >> 6;
+ // This is guaranteed to be less than 64, but accumulation may bring it past
+ // 64 for higher x values.
+ const int ystep_remainder4 = ystep4 & 0x3F;
+ const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4);
+ const __m128i increment_top4 = _mm_set1_epi16(4 << 6);
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which will go into the left_column offset.
+ // Following values need the full ystep as a relative offset.
+ const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+ const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+ __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+ left_y = _mm_add_epi16(ystep_init, left_y);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+
+ int x = 0;
+ // Loop over x for columns with a mixture of sources.
+ for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4,
+ xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4),
+ left_y = _mm_add_epi16(left_y, increment_left4),
+ left_offset -= left_base_increment4) {
+ uint8_t* dst_x = dst + x;
+
+    // Round down to a multiple of 4; the mask also clears bit 3, which only
+    // moves this boundary lower.
+ const int max_top_only_y = std::min((x << 6) / xstep, height) & 0xFFFFFFF4;
+ DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+ const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+    // All rows from |min_left_only_y| down for this set of columns only need
+    // |left_column| to compute.
+ const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height);
+
+ __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+ __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+ int top_x = -xstep_y;
+
+ // Loop over y for mixed rows.
+ for (; y < min_left_only_y;
+ y += 4, dst_x += stride4,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect),
+ top_x -= xstep4) {
+ DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) * (1 << upsample_left_shift)),
+ left_y);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+ DirectionalZone1Blend_4x4<upsampled_top>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Loop over y for left-only rows, if any.
+ for (; y < height; y += 4, dst_x += stride4) {
+ DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y);
+ }
+ }
+ // Loop over top-only columns, if any.
+ for (; x < width; x += 4) {
+ DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+ height, -xstep, upsampled_top);
+ }
+}
+
+void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep,
+ const bool upsampled_top,
+ const bool upsampled_left) {
+ // Increasing the negative buffer for this function allows more rows to be
+ // processed at a time without branching in an inner loop to check the base.
+ uint8_t top_buffer[288];
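+  // With this layout, |top_ptr| and |left_ptr| can be indexed down to -144
+  // without leaving the local buffers; only indices in [-16, 143] hold copied
+  // pixels, and values read from below that range are expected to be blended
+  // away rather than selected into the output.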
+ uint8_t left_buffer[288];
+ memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
+ memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
+ const uint8_t* top_ptr = top_buffer + 144;
+ const uint8_t* left_ptr = left_buffer + 144;
+ if (width == 4 || height == 4) {
+ if (upsampled_left) {
+ if (upsampled_top) {
+ DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ } else {
+ if (upsampled_top) {
+ DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ }
+ return;
+ }
+ if (upsampled_left) {
+ if (upsampled_top) {
+ DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ } else {
+ if (upsampled_top) {
+ DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// FilterIntraPredictor_SSE4_1
+
+// Apply all filter taps to the given 7 packed 8-bit values, keeping the 8th
+// at zero to preserve the sum.
+inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride,
+ const __m128i& pixels, const __m128i& taps_0_1,
+ const __m128i& taps_2_3, const __m128i& taps_4_5,
+ const __m128i& taps_6_7) {
+ const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1);
+ const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3);
+ // |output_half| contains 8 partial sums.
+ __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
+ __m128i output = _mm_hadd_epi16(output_half, output_half);
+ const __m128i output_row0 =
+ _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+ /* arbitrary pack arg */ output);
+ Store4(dst, output_row0);
+ const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5);
+ const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7);
+ output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
+ output = _mm_hadd_epi16(output_half, output_half);
+ const __m128i output_row1 =
+ _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+ /* arbitrary pack arg */ output);
+ Store4(dst + stride, output_row1);
+}
+
+// 4xH transform sizes are given special treatment because LoadLo8 goes out
+// of bounds and every block involves the left column. This implementation
+// loads TL from the top row for the first block, so it is not taken from the
+// left column.
+inline void Filter4xH(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const top_ptr,
+ const uint8_t* const left_ptr, FilterIntraPredictor pred,
+ const int height) {
+ const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]);
+ const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]);
+ const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]);
+ const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]);
+ __m128i top = Load4(top_ptr - 1);
+ __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4);
+ __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr));
+ left = _mm_slli_si128(left, 5);
+
+ // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
+ // left[2], left[3], left[4], left[5], left[6], left[7]
+ pixels = _mm_or_si128(left, pixels);
+
+ // Duplicate first 8 bytes.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 1.
+ pixels = Load4(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
+ // left[0], left[1], ...
+ pixels = _mm_or_si128(left, pixels);
+
+ // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
+ // byte is an unused value, which shall be multiplied by 0 when we apply the
+ // filter.
+ constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
+
+ // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
+ const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 2.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 3.
+
+  // Compute the middle 8 rows before using common code for the final 4 rows.
+  // The common code below assumes that the next TL is at position 8 of
+  // |left|, so this block must leave |left| in that state.
+ if (height == 16) {
+ // This shift allows us to use pixel_order2 twice after shifting by 2 later.
+ left = _mm_slli_si128(left, 1);
+ pixels = Load4(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
+ // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
+ pixels = _mm_or_si128(left, pixels);
+
+    // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 10, 11, 15. The
+    // last byte is an unused value, as above. The top-left was shifted to
+    // position nine to keep two empty spaces after the top pixels.
+ constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
+
+ // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
+ // the end.
+ const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ dest += stride; // Move to y = 4.
+
+ // First 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+
+ // Clear all but final pixel in the first 8 of left column.
+ __m128i keep_top_left = _mm_srli_si128(left, 13);
+ dest += stride; // Move to y = 5.
+ pixels = Load4(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
+ // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
+ pixels = _mm_or_si128(left, pixels);
+ left = LoadLo8(left_ptr + 8);
+
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ dest += stride; // Move to y = 6.
+
+ // Second 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+
+ // Position TL value so we can use pixel_order1.
+ keep_top_left = _mm_slli_si128(keep_top_left, 6);
+ dest += stride; // Move to y = 7.
+ pixels = Load4(dest);
+ left = _mm_slli_si128(left, 7);
+ left = _mm_or_si128(left, keep_top_left);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 8.
+
+ // Third 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 9.
+
+ // Prepare final inputs.
+ pixels = Load4(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 10.
+
+ // Fourth 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 11.
+ }
+
+ // In both the 8 and 16 case, we assume that the left vector has the next TL
+ // at position 8.
+ if (height > 4) {
+ // Erase prior left pixels by shifting TL to position 0.
+ left = _mm_srli_si128(left, 8);
+ left = _mm_slli_si128(left, 6);
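+ // For reference: the right shift brings that TL from byte 8 down to byte 0
+ // (discarding the older left pixels), and the left shift then parks it at
+ // byte 6, the first position read by |pixel_order1|; bytes 7 and up still
+ // hold the next left pixels.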
+ pixels = Load4(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 12 or 4.
+
+ // First of final two 4x2 blocks.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 13 or 5.
+ pixels = Load4(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 14 or 6.
+
+ // Last of final two 4x2 blocks.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ }
+}
+
+void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ FilterIntraPredictor pred, const int width,
+ const int height) {
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (width == 4) {
+ Filter4xH(dst, stride, top_ptr, left_ptr, pred, height);
+ return;
+ }
+
+ // There is one set of 7 taps for each of the 4x2 output pixels.
+ const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]);
+ const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]);
+ const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]);
+ const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]);
+
+ // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
+ // the end is an unused value, which shall be multiplied by 0 when we apply
+ // the filter.
+ constexpr int64_t kCondenseLeftMask = 0x0F09080403020100;
+
+ // Takes the "left section" and puts it right after p0-p4.
+ const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
+
+ // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
+ // byte is unused as above.
+ constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008;
+
+ // Shuffles the "top left" from the left section, to the front. Used when
+ // grabbing data from left_column and not top_row.
+ const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
+
+ // This first pass takes care of the cases where the top left pixel comes from
+ // top_row.
+ __m128i pixels = LoadLo8(top_ptr - 1);
+ __m128i left = _mm_slli_si128(Load4(left_column), 8);
+ pixels = _mm_or_si128(pixels, left);
+
+ // Two sets of the same pixels to multiply with two sets of taps.
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7);
+ left = _mm_srli_si128(left, 1);
+
+ // Load the row written at y = 1; it provides the top pixels for the next
+ // 4x2 block.
+ pixels = Load4(dst + stride);
+
+ // Because of the above shift, this OR 'invades' the final byte of the first
+ // 8 bytes of |pixels|. This is acceptable because the 8th filter tap is
+ // always a padded 0.
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ const ptrdiff_t stride2 = stride << 1;
+ const ptrdiff_t stride4 = stride << 2;
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dst += 4;
+ for (int x = 3; x < width - 4; x += 4) {
+ pixels = Load4(top_ptr + x);
+ pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4);
+ pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ pixels = Load4(dst + stride - 1);
+ pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+ taps_4_5, taps_6_7);
+ dst += 4;
+ }
+
+ // Now we handle the remaining rows of blocks, which reference previously
+ // written rows rather than top_row.
+ for (int y = 4; y < height; y += 4) {
+ // Leftmost 4x4 block for this row of blocks.
+ dst -= width;
+ dst += stride4;
+
+ // The top-left pixel is not available via an offset from |dst| in these
+ // leftmost blocks; it is taken from |left_ptr| instead.
+ pixels = Load4(dst - stride);
+ left = _mm_slli_si128(Load4(left_ptr + y - 1), 8);
+ left = _mm_insert_epi8(left, left_ptr[y + 3], 12);
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+
+ // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
+ left = _mm_srli_si128(left, 2);
+ pixels = Load4(dst + stride);
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+ taps_4_5, taps_6_7);
+
+ dst += 4;
+
+ // Remaining 4x4 blocks for this row of blocks.
+ for (int x = 4; x < width; x += 4) {
+ pixels = Load4(dst - stride - 1);
+ pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ pixels = Load4(dst + stride - 1);
+ pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+ taps_4_5, taps_6_7);
+ dst += 4;
+ }
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+// These guards check if this version of the function was not superseded by
+// a higher optimization level, such as AVX. The corresponding #define also
+// prevents the C version from being added to the table.
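+// For example, if an AVX2 header had already claimed FilterIntraPredictor by
+// defining LIBGAV1_Dsp8bpp_FilterIntraPredictor to its own CPU flag, the
+// DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor) check below would evaluate to
+// false and this SSE4.1 entry would be skipped.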
+#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor)
+ dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1)
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2)
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3)
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DcDefs::_4x4::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+ DcDefs::_4x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+ DcDefs::_4x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+ DcDefs::_8x4::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+ DcDefs::_8x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+ DcDefs::_8x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+ DcDefs::_8x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+ DcDefs::_16x4::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+ DcDefs::_16x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+ DcDefs::_16x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+ DcDefs::_16x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+ DcDefs::_16x64::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+ DcDefs::_32x8::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+ DcDefs::_32x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+ DcDefs::_32x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+ DcDefs::_32x64::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+ DcDefs::_64x16::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+ DcDefs::_64x32::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+ DcDefs::_64x64::DcTop;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DcDefs::_4x4::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+ DcDefs::_4x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+ DcDefs::_4x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+ DcDefs::_8x4::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+ DcDefs::_8x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+ DcDefs::_8x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+ DcDefs::_8x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+ DcDefs::_16x4::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+ DcDefs::_16x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+ DcDefs::_16x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+ DcDefs::_16x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+ DcDefs::_16x64::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+ DcDefs::_32x8::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+ DcDefs::_32x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+ DcDefs::_32x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+ DcDefs::_32x64::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+ DcDefs::_64x16::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+ DcDefs::_64x32::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+ DcDefs::_64x64::DcLeft;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DcDefs::_4x4::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+ DcDefs::_4x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+ DcDefs::_4x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+ DcDefs::_8x4::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+ DcDefs::_8x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+ DcDefs::_8x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+ DcDefs::_8x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+ DcDefs::_16x4::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+ DcDefs::_16x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+ DcDefs::_16x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+ DcDefs::_16x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+ DcDefs::_16x64::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+ DcDefs::_32x8::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+ DcDefs::_32x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+ DcDefs::_32x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+ DcDefs::_32x64::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+ DcDefs::_64x16::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+ DcDefs::_64x32::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+ DcDefs::_64x64::Dc;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+ Paeth4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+ Paeth4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+ Paeth4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+ Paeth8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+ Paeth8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+ Paeth8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+ Paeth8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+ Paeth16x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+ Paeth16x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+ Paeth16x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+ Paeth16x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+ Paeth16x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+ Paeth32x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+ Paeth32x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+ Paeth32x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+ Paeth32x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+ Paeth64x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+ Paeth64x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorPaeth)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+ Paeth64x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+ DirDefs::_4x4::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ DirDefs::_4x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ DirDefs::_4x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+ DirDefs::_8x4::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ DirDefs::_8x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+ DirDefs::_8x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ DirDefs::_8x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+ DirDefs::_16x4::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ DirDefs::_16x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+ DirDefs::_16x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+ DirDefs::_16x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+ DirDefs::_16x64::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+ DirDefs::_32x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+ DirDefs::_32x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+ DirDefs::_32x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ DirDefs::_32x64::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+ DirDefs::_64x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+ DirDefs::_64x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+ DirDefs::_64x64::Horizontal;
+#endif
+} // NOLINT(readability/fn_size)
+// TODO(petersonab): Split Init8bpp function into family-specific files.
+
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+template <int height>
+inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
+ const __m128i dc) {
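+ // |dc| carries the DC value in its low 16-bit word; duplicating it across
+ // the low four words lets each 4-pixel (8-byte) row be written with a single
+ // StoreLo8. The last row is stored after the loop so the trailing stride
+ // increment is skipped.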
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0);
+ int y = height - 1;
+ auto* dst = static_cast<uint8_t*>(dest);
+ do {
+ StoreLo8(dst, dc_dup);
+ dst += stride;
+ } while (--y != 0);
+ StoreLo8(dst, dc_dup);
+}
+
+// WriteDuplicateN assumes dup has 4 32-bit "units," each of which comprises 2
+// identical shorts that need N total copies written into dest. The unpacking
+// works the same as in the 8bpp case, except that each 32-bit unit needs twice
+// as many copies.
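+// For instance, if |dup32| holds the shorts {a, a, b, b, c, c, d, d},
+// WriteDuplicate4x4 produces row 0 = {a, a, a, a}, row 1 = {b, b, b, b},
+// row 2 = {c, c, c, c} and row 3 = {d, d, d, d}, each element being one
+// 16-bit pixel.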
+inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ auto* dst = static_cast<uint8_t*>(dest);
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_lo);
+ dst += stride;
+ _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_lo));
+ dst += stride;
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_hi);
+ dst += stride;
+ _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_hi));
+}
+
+inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+}
+
+inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+}
+
+inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_0);
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_1);
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_2);
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_3);
+}
+
+inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
+ const __m128i dup32) {
+ const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+ const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+ auto* dst = static_cast<uint8_t*>(dest);
+ const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+ for (int x = 0; x < 128; x += 16) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_0);
+ }
+ dst += stride;
+ const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+ for (int x = 0; x < 128; x += 16) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_1);
+ }
+ dst += stride;
+ const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+ for (int x = 0; x < 128; x += 16) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_2);
+ }
+ dst += stride;
+ const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+ for (int x = 0; x < 128; x += 16) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_3);
+ }
+}
+
+// ColStoreN<height> copies each of the |height| values in |column| across its
+// corresponding row in dest.
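+// For example, ColStore4 loads 4 int16 values and _mm_unpacklo_epi16(x, x)
+// widens each one into the pair-of-identical-shorts layout that the
+// WriteDuplicate functions above expect.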
+template <WriteDuplicateFunc writefn>
+inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const __m128i col_data = LoadLo8(column);
+ const __m128i col_dup32 = _mm_unpacklo_epi16(col_data, col_data);
+ writefn(dest, stride, col_dup32);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const __m128i col_data = LoadUnaligned16(column);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+ auto* dst = static_cast<uint8_t*>(dest);
+ writefn(dst, stride, col_dup32_lo);
+ const ptrdiff_t stride4 = stride << 2;
+ dst += stride4;
+ writefn(dst, stride, col_dup32_hi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 32; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+ writefn(dst, stride, col_dup32_lo);
+ dst += stride4;
+ writefn(dst, stride, col_dup32_hi);
+ dst += stride4;
+ }
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 64; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+ writefn(dst, stride, col_dup32_lo);
+ dst += stride4;
+ writefn(dst, stride, col_dup32_hi);
+ dst += stride4;
+ }
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore64_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const column) {
+ const ptrdiff_t stride4 = stride << 2;
+ auto* dst = static_cast<uint8_t*>(dest);
+ for (int y = 0; y < 128; y += 16) {
+ const __m128i col_data =
+ LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+ const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+ const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+ writefn(dst, stride, col_dup32_lo);
+ dst += stride4;
+ writefn(dst, stride, col_dup32_hi);
+ dst += stride4;
+ }
+}
+
+// |ref| points to 8 bytes containing 4 packed int16 values.
+inline __m128i DcSum4_SSE4_1(const void* ref) {
+ const __m128i vals = _mm_loadl_epi64(static_cast<const __m128i*>(ref));
+ const __m128i ones = _mm_set1_epi16(1);
+
+ // half_sum[31:0] = a1+a2
+ // half_sum[63:32] = a3+a4
+ const __m128i half_sum = _mm_madd_epi16(vals, ones);
+ // Place half_sum[63:32] in shift_sum[31:0].
+ const __m128i shift_sum = _mm_srli_si128(half_sum, 4);
+ return _mm_add_epi32(half_sum, shift_sum);
+}
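+// In scalar terms (viewing |ref| as int16_t ref16[4]), the low 32 bits of the
+// result are ref16[0] + ref16[1] + ref16[2] + ref16[3]: _mm_madd_epi16
+// against a vector of ones produces the two pairwise sums and the final add
+// folds them together.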
+
+struct DcDefs {
+ DcDefs() = delete;
+
+ using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
+ DcStore4xH_SSE4_1<4>, 0, 0>;
+};
+
+struct DirDefs {
+ DirDefs() = delete;
+
+ using _4x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
+ using _4x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
+ using _4x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
+ using _8x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
+ using _8x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
+ using _8x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
+ using _8x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
+ using _16x4 =
+ DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
+ using _16x8 =
+ DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
+ using _16x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
+ using _16x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
+ using _16x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
+ using _32x8 =
+ DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
+ using _32x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
+ using _32x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
+ using _32x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
+ using _64x16 =
+ DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
+ using _64x32 =
+ DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
+ using _64x64 =
+ DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
+};
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DcDefs::_4x4::DcTop;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DcDefs::_4x4::DcLeft;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DcDefs::_4x4::Dc;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+ DirDefs::_4x4::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ DirDefs::_4x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ DirDefs::_4x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+ DirDefs::_8x4::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ DirDefs::_8x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+ DirDefs::_8x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ DirDefs::_8x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+ DirDefs::_16x4::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ DirDefs::_16x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+ DirDefs::_16x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+ DirDefs::_16x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+ DirDefs::_16x64::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+ DirDefs::_32x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+ DirDefs::_32x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+ DirDefs::_32x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ DirDefs::_32x64::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+ DirDefs::_64x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+ DirDefs::_64x32::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal)
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+ DirDefs::_64x64::Horizontal;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_sse4.h b/src/dsp/x86/intrapred_sse4.h
new file mode 100644
index 0000000..7f4fcd7
--- /dev/null
+++ b/src/dsp/x86/intrapred_sse4.h
@@ -0,0 +1,1060 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*,
+// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and
+// Dsp::filter_intra_predictor, see the defines below for specifics. These
+// functions are not thread-safe.
+void IntraPredInit_SSE4_1();
+void IntraPredCflInit_SSE4_1();
+void IntraPredSmoothInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If SSE4.1 is targeted and the baseline has not already been claimed by a
+// higher optimization level, signal that the SSE4.1 implementation should be
+// used.
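+// For example, an AVX2 header processed earlier could define
+// LIBGAV1_Dsp8bpp_FilterIntraPredictor to its own CPU flag; the #ifndef
+// guards below would then leave that choice in place rather than overriding
+// it with LIBGAV1_CPU_SSE4_1.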
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
+#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+//------------------------------------------------------------------------------
+// 10bpp
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_
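Each of the macros above records that an SSE4.1 implementation claims the corresponding (bitdepth, transform size, predictor) slot; code consuming the header can test the value to decide whether a plain C fallback is still needed. A minimal standalone sketch of the claim-a-slot pattern, using made-up names rather than libgav1's real initialization code:

#include <cstdio>

#define CPU_SSE4_1 (1 << 0)

// Normally defined to a CPU flag by an arch-specific header like the one
// above; the #ifndef guard means only the first claimant wins.
#ifndef Dsp8bpp_Size4x4_PredictorDc
#define Dsp8bpp_Size4x4_PredictorDc CPU_SSE4_1
#endif

// Nothing claimed this slot, so it stays 0 and the C version is used.
#ifndef Dsp8bpp_Size4x4_PredictorPaeth
#define Dsp8bpp_Size4x4_PredictorPaeth 0
#endif

int main() {
  std::printf("Dc:    %s\n",
              (Dsp8bpp_Size4x4_PredictorDc & CPU_SSE4_1) != 0 ? "SSE4.1" : "C");
  std::printf("Paeth: %s\n",
              (Dsp8bpp_Size4x4_PredictorPaeth & CPU_SSE4_1) != 0 ? "SSE4.1"
                                                                 : "C");
  return 0;
}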
diff --git a/src/dsp/x86/inverse_transform_sse4.cc b/src/dsp/x86/inverse_transform_sse4.cc
new file mode 100644
index 0000000..787d706
--- /dev/null
+++ b/src/dsp/x86/inverse_transform_sse4.cc
@@ -0,0 +1,3086 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+template <int store_width, int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* dst, int32_t stride, int32_t idx,
+ const __m128i* s) {
+ // NOTE: It is expected that the compiler will unroll these loops.
+ if (store_width == 16) {
+ for (int i = 0; i < store_count; i += 4) {
+ StoreUnaligned16(&dst[i * stride + idx], s[i]);
+ StoreUnaligned16(&dst[(i + 1) * stride + idx], s[i + 1]);
+ StoreUnaligned16(&dst[(i + 2) * stride + idx], s[i + 2]);
+ StoreUnaligned16(&dst[(i + 3) * stride + idx], s[i + 3]);
+ }
+ }
+ if (store_width == 8) {
+ for (int i = 0; i < store_count; i += 4) {
+ StoreLo8(&dst[i * stride + idx], s[i]);
+ StoreLo8(&dst[(i + 1) * stride + idx], s[i + 1]);
+ StoreLo8(&dst[(i + 2) * stride + idx], s[i + 2]);
+ StoreLo8(&dst[(i + 3) * stride + idx], s[i + 3]);
+ }
+ }
+}
+
+template <int load_width, int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* src, int32_t stride,
+ int32_t idx, __m128i* x) {
+ // NOTE: It is expected that the compiler will unroll these loops.
+ if (load_width == 16) {
+ for (int i = 0; i < load_count; i += 4) {
+ x[i] = LoadUnaligned16(&src[i * stride + idx]);
+ x[i + 1] = LoadUnaligned16(&src[(i + 1) * stride + idx]);
+ x[i + 2] = LoadUnaligned16(&src[(i + 2) * stride + idx]);
+ x[i + 3] = LoadUnaligned16(&src[(i + 3) * stride + idx]);
+ }
+ }
+ if (load_width == 8) {
+ for (int i = 0; i < load_count; i += 4) {
+ x[i] = LoadLo8(&src[i * stride + idx]);
+ x[i + 1] = LoadLo8(&src[(i + 1) * stride + idx]);
+ x[i + 2] = LoadLo8(&src[(i + 2) * stride + idx]);
+ x[i + 3] = LoadLo8(&src[(i + 3) * stride + idx]);
+ }
+ }
+}
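For orientation, a hypothetical call into the two helpers above: the first template argument is the load/store width in bytes and the second is the number of rows, so an 8x8 block of int16_t coefficients round-trips as shown below (dst and the layout are illustrative, not taken from a caller in this file).

// Hypothetical usage of LoadSrc/StoreDst for an 8x8 block of int16_t
// residuals stored row by row with a stride of 8 elements.
__m128i rows[8];
LoadSrc<16, 8>(dst, /*stride=*/8, /*idx=*/0, rows);   // eight 16-byte rows in
StoreDst<16, 8>(dst, /*stride=*/8, /*idx=*/0, rows);  // and back out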
+
+// Butterfly rotate 4 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(__m128i* a, __m128i* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const __m128i psin_pcos = _mm_set1_epi32(
+ static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
+ const __m128i ba = _mm_unpacklo_epi16(*a, *b);
+ const __m128i ab = _mm_unpacklo_epi16(*b, *a);
+ const __m128i sign =
+ _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001);
+ // -sin cos, -sin cos, -sin cos, -sin cos
+ const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
+ const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
+ const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
+ const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
+ const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
+ const __m128i x = _mm_packs_epi32(x1, x1);
+ const __m128i y = _mm_packs_epi32(y1, y1);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
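Each 16-bit lane pair above goes through the standard 2x2 rotation used by the AV1 inverse transforms: the (cos, -sin) and (cos, sin) constants are packed per 32-bit lane so that _mm_madd_epi16 evaluates both dot products at once. A scalar sketch of one lane, assuming cos128/sin128 are the Q12 values returned by Cos128()/Sin128():

#include <cstdint>

// Scalar model of one lane of ButterflyRotation_4/_8 (a sketch, not the
// library code). The real code also saturates when packing back to 16 bits.
inline void ButterflyRotationScalar(int16_t* a, int16_t* b, int32_t cos128,
                                    int32_t sin128, bool flip) {
  const int32_t x32 = *a * cos128 - *b * sin128;  // madd(ba, {cos, -sin})
  const int32_t y32 = *a * sin128 + *b * cos128;  // madd(ab, {cos, sin})
  // RightShiftWithRounding_S32(..., 12).
  const auto x = static_cast<int16_t>((x32 + (1 << 11)) >> 12);
  const auto y = static_cast<int16_t>((y32 + (1 << 11)) >> 12);
  *a = flip ? y : x;
  *b = flip ? x : y;
}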
+
+// Butterfly rotate 8 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(__m128i* a, __m128i* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const __m128i psin_pcos = _mm_set1_epi32(
+ static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
+ const __m128i sign =
+ _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001);
+ // -sin cos, -sin cos, -sin cos, -sin cos
+ const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
+ const __m128i ba = _mm_unpacklo_epi16(*a, *b);
+ const __m128i ab = _mm_unpacklo_epi16(*b, *a);
+ const __m128i ba_hi = _mm_unpackhi_epi16(*a, *b);
+ const __m128i ab_hi = _mm_unpackhi_epi16(*b, *a);
+ const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
+ const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
+ const __m128i x0_hi = _mm_madd_epi16(ba_hi, msin_pcos);
+ const __m128i y0_hi = _mm_madd_epi16(ab_hi, psin_pcos);
+ const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
+ const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
+ const __m128i x1_hi = RightShiftWithRounding_S32(x0_hi, 12);
+ const __m128i y1_hi = RightShiftWithRounding_S32(y0_hi, 12);
+ const __m128i x = _mm_packs_epi32(x1, x1_hi);
+ const __m128i y = _mm_packs_epi32(y1, y1_hi);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(__m128i* a, __m128i* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const __m128i pcos = _mm_set1_epi16(cos128 << 3);
+ const __m128i psin = _mm_set1_epi16(-(sin128 << 3));
+ const __m128i x = _mm_mulhrs_epi16(*b, psin);
+ const __m128i y = _mm_mulhrs_epi16(*b, pcos);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(__m128i* a,
+ __m128i* b,
+ const int angle,
+ const bool flip) {
+ const int16_t cos128 = Cos128(angle);
+ const int16_t sin128 = Sin128(angle);
+ const __m128i pcos = _mm_set1_epi16(cos128 << 3);
+ const __m128i psin = _mm_set1_epi16(sin128 << 3);
+ const __m128i x = _mm_mulhrs_epi16(*a, pcos);
+ const __m128i y = _mm_mulhrs_epi16(*a, psin);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(__m128i* a, __m128i* b, bool flip) {
+ __m128i x, y;
+ if (flip) {
+ y = _mm_adds_epi16(*b, *a);
+ x = _mm_subs_epi16(*b, *a);
+ } else {
+ x = _mm_adds_epi16(*a, *b);
+ y = _mm_subs_epi16(*a, *b);
+ }
+ *a = x;
+ *b = y;
+}
+
+using ButterflyRotationFunc = void (*)(__m128i* a, __m128i* b, int angle,
+ bool flip);
+
+LIBGAV1_ALWAYS_INLINE __m128i ShiftResidual(const __m128i residual,
+ const __m128i v_row_shift_add,
+ const __m128i v_row_shift) {
+ const __m128i k7ffd = _mm_set1_epi16(0x7ffd);
+ // The max row_shift is 2, so int16_t values greater than 0x7ffd may
+ // overflow. Generate a mask for this case.
+ const __m128i mask = _mm_cmpgt_epi16(residual, k7ffd);
+ const __m128i x = _mm_add_epi16(residual, v_row_shift_add);
+ // Arithmetic shift for lanes where the add did not overflow int16_t.
+ const __m128i a = _mm_sra_epi16(x, v_row_shift);
+ // Logical shift for lanes that overflowed; their bits are correct as uint16_t.
+ const __m128i b = _mm_srl_epi16(x, v_row_shift);
+ // Select the correct shifted value.
+ return _mm_blendv_epi8(a, b, mask);
+}
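A scalar sketch of the overflow handling above, under the callers' guarantee that row_shift is at most 2 and the rounding bias is (1 << row_shift) >> 1: adding the bias to a residual above 0x7ffd wraps the signed 16-bit value, but the wrapped bit pattern shifted as unsigned still produces the correct result.

#include <cstdint>

// Scalar model of ShiftResidual (a sketch under the assumptions above).
inline int16_t ShiftResidualScalar(int16_t residual, int row_shift,
                                   int16_t rounding_bias) {
  // The 16-bit add may wrap for residual > 0x7ffd; keep the wrapped bits.
  const auto x = static_cast<int16_t>(residual + rounding_bias);
  if (residual > 0x7ffd) {
    // Reinterpret as unsigned and use a logical shift, like _mm_srl_epi16.
    return static_cast<int16_t>(static_cast<uint16_t>(x) >> row_shift);
  }
  // Otherwise an arithmetic shift, like _mm_sra_epi16.
  return static_cast<int16_t>(x >> row_shift);
}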
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+ const __m128i v_src =
+ (width == 4) ? v_src_lo : _mm_shuffle_epi32(v_src_lo, 0);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
+ const int16_t cos128 = Cos128(32);
+ const __m128i xy = _mm_mulhrs_epi16(s0, _mm_set1_epi16(cos128 << 3));
+
+ // Expand to 32 bits to prevent int16_t overflows during the shift add.
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_cvtepi16_epi32(xy);
+ const __m128i a1 = _mm_cvtepi16_epi32(_mm_srli_si128(xy, 8));
+ const __m128i b = _mm_add_epi32(a, v_row_shift_add);
+ const __m128i b1 = _mm_add_epi32(a1, v_row_shift_add);
+ const __m128i c = _mm_sra_epi32(b, v_row_shift);
+ const __m128i c1 = _mm_sra_epi32(b1, v_row_shift);
+ const __m128i xy_shifted = _mm_packs_epi32(c, c1);
+
+ if (width == 4) {
+ StoreLo8(dst, xy_shifted);
+ } else {
+ for (int i = 0; i < width; i += 8) {
+ StoreUnaligned16(dst, xy_shifted);
+ dst += 8;
+ }
+ }
+ return true;
+}
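When only the DC coefficient is nonzero, the row transform reduces to two Q12 multiplies plus the row shift; a scalar sketch of the path above. Since row_shift is at most 2 here, the rounding bias (1 << row_shift) >> 1 equals row_shift, which is why the vector code can simply add row_shift before shifting.

#include <cstdint>

// Scalar model of the DC-only row path (a sketch; row_multiplier and
// cos128_32 stand for kTransformRowMultiplier and Cos128(32) in Q12).
inline int16_t DctDcOnlyScalar(int16_t dc, bool should_round, int row_shift,
                               int16_t row_multiplier, int16_t cos128_32) {
  int32_t v = dc;
  // Optional row rounding, matching _mm_mulhrs_epi16 with the multiplier
  // pre-shifted left by 3, i.e. Round2(v * row_multiplier, 12).
  if (should_round) v = (v * row_multiplier + (1 << 11)) >> 12;
  // The single surviving butterfly: multiply by Cos128(32) in Q12.
  v = (v * cos128_32 + (1 << 11)) >> 12;
  // Row shift with rounding.
  const int32_t bias = (1 << row_shift) >> 1;
  return static_cast<int16_t>((v + bias) >> row_shift);
}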
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const int16_t cos128 = Cos128(32);
+
+ // Calculate dc values for first row.
+ if (width == 4) {
+ const __m128i v_src = LoadLo8(dst);
+ const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
+ StoreLo8(dst, xy);
+ } else {
+ int i = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&dst[i]);
+ const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
+ StoreUnaligned16(&dst[i], xy);
+ i += 8;
+ } while (i < width);
+ }
+
+ // Copy first row to the rest of the block.
+ for (int y = 1; y < height; ++y) {
+ memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+ }
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(__m128i* s) {
+ // stage 12.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+ ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+ } else {
+ butterfly_rotation(&s[0], &s[1], 32, true);
+ butterfly_rotation(&s[2], &s[3], 48, false);
+ }
+
+ // stage 17.
+ HadamardRotation(&s[0], &s[3], false);
+ HadamardRotation(&s[1], &s[2], false);
+}
+
+// Process 4 dct4 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct4_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[4], x[4];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[8];
+ LoadSrc<8, 8>(dst, step, 0, input);
+ Transpose4x8To8x4_U16(input, x);
+ } else {
+ LoadSrc<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ LoadSrc<8, 4>(dst, step, 0, x);
+ if (transpose) {
+ Transpose4x4_U16(x, x);
+ }
+ }
+ // stage 1.
+ // kBitReverseLookup 0, 2, 1, 3
+ s[0] = x[0];
+ s[1] = x[2];
+ s[2] = x[1];
+ s[3] = x[3];
+
+ Dct4Stages<butterfly_rotation>(s);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[8];
+ Transpose8x4To4x8_U16(s, output);
+ StoreDst<8, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 4>(dst, step, 0, s);
+ }
+ } else {
+ if (transpose) {
+ Transpose4x4_U16(s, s);
+ }
+ StoreDst<8, 4>(dst, step, 0, s);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(__m128i* s) {
+ // stage 8.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+ ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+ } else {
+ butterfly_rotation(&s[4], &s[7], 56, false);
+ butterfly_rotation(&s[5], &s[6], 24, false);
+ }
+
+ // stage 13.
+ HadamardRotation(&s[4], &s[5], false);
+ HadamardRotation(&s[6], &s[7], true);
+
+ // stage 18.
+ butterfly_rotation(&s[6], &s[5], 32, true);
+
+ // stage 22.
+ HadamardRotation(&s[0], &s[7], false);
+ HadamardRotation(&s[1], &s[6], false);
+ HadamardRotation(&s[2], &s[5], false);
+ HadamardRotation(&s[3], &s[4], false);
+}
+
+// Process dct8 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct8_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[8], x[8];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8_U16(input, x);
+ } else {
+ LoadSrc<8, 8>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, 0, input);
+ Transpose8x8_U16(input, x);
+ } else {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+ s[0] = x[0];
+ s[1] = x[4];
+ s[2] = x[2];
+ s[3] = x[6];
+ s[4] = x[1];
+ s[5] = x[5];
+ s[6] = x[3];
+ s[7] = x[7];
+
+ Dct4Stages<butterfly_rotation>(s);
+ Dct8Stages<butterfly_rotation>(s);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[4];
+ Transpose4x8To8x4_U16(s, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ } else {
+ StoreDst<8, 8>(dst, step, 0, s);
+ }
+ } else {
+ if (transpose) {
+ __m128i output[8];
+ Transpose8x8_U16(s, output);
+ StoreDst<16, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 8>(dst, step, 0, s);
+ }
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(__m128i* s) {
+ // stage 5.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+ ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+ ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+ ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+ } else {
+ butterfly_rotation(&s[8], &s[15], 60, false);
+ butterfly_rotation(&s[9], &s[14], 28, false);
+ butterfly_rotation(&s[10], &s[13], 44, false);
+ butterfly_rotation(&s[11], &s[12], 12, false);
+ }
+
+ // stage 9.
+ HadamardRotation(&s[8], &s[9], false);
+ HadamardRotation(&s[10], &s[11], true);
+ HadamardRotation(&s[12], &s[13], false);
+ HadamardRotation(&s[14], &s[15], true);
+
+ // stage 14.
+ butterfly_rotation(&s[14], &s[9], 48, true);
+ butterfly_rotation(&s[13], &s[10], 112, true);
+
+ // stage 19.
+ HadamardRotation(&s[8], &s[11], false);
+ HadamardRotation(&s[9], &s[10], false);
+ HadamardRotation(&s[12], &s[15], true);
+ HadamardRotation(&s[13], &s[14], true);
+
+ // stage 23.
+ butterfly_rotation(&s[13], &s[10], 32, true);
+ butterfly_rotation(&s[12], &s[11], 32, true);
+
+ // stage 26.
+ HadamardRotation(&s[0], &s[15], false);
+ HadamardRotation(&s[1], &s[14], false);
+ HadamardRotation(&s[2], &s[13], false);
+ HadamardRotation(&s[3], &s[12], false);
+ HadamardRotation(&s[4], &s[11], false);
+ HadamardRotation(&s[5], &s[10], false);
+ HadamardRotation(&s[6], &s[9], false);
+ HadamardRotation(&s[7], &s[8], false);
+}
+
+// Process dct16 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct16_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[16], x[16];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8_U16(input, x);
+ LoadSrc<16, 4>(dst, step, 8, input);
+ Transpose8x4To4x8_U16(input, &x[8]);
+ } else {
+ LoadSrc<8, 16>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, idx, input);
+ Transpose8x8_U16(input, &x[idx]);
+ }
+ } else {
+ LoadSrc<16, 16>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1
+ // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ s[0] = x[0];
+ s[1] = x[8];
+ s[2] = x[4];
+ s[3] = x[12];
+ s[4] = x[2];
+ s[5] = x[10];
+ s[6] = x[6];
+ s[7] = x[14];
+ s[8] = x[1];
+ s[9] = x[9];
+ s[10] = x[5];
+ s[11] = x[13];
+ s[12] = x[3];
+ s[13] = x[11];
+ s[14] = x[7];
+ s[15] = x[15];
+
+ Dct4Stages<butterfly_rotation>(s);
+ Dct8Stages<butterfly_rotation>(s);
+ Dct16Stages<butterfly_rotation>(s);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[4];
+ Transpose4x8To8x4_U16(s, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ Transpose4x8To8x4_U16(&s[8], output);
+ StoreDst<16, 4>(dst, step, 8, output);
+ } else {
+ StoreDst<8, 16>(dst, step, 0, s);
+ }
+ } else {
+ if (transpose) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ __m128i output[8];
+ Transpose8x8_U16(&s[idx], output);
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 16>(dst, step, 0, s);
+ }
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(__m128i* s) {
+ // stage 3
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+ ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+ ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+ ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+ ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+ ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+ ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+ ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+ } else {
+ butterfly_rotation(&s[16], &s[31], 62, false);
+ butterfly_rotation(&s[17], &s[30], 30, false);
+ butterfly_rotation(&s[18], &s[29], 46, false);
+ butterfly_rotation(&s[19], &s[28], 14, false);
+ butterfly_rotation(&s[20], &s[27], 54, false);
+ butterfly_rotation(&s[21], &s[26], 22, false);
+ butterfly_rotation(&s[22], &s[25], 38, false);
+ butterfly_rotation(&s[23], &s[24], 6, false);
+ }
+ // stage 6.
+ HadamardRotation(&s[16], &s[17], false);
+ HadamardRotation(&s[18], &s[19], true);
+ HadamardRotation(&s[20], &s[21], false);
+ HadamardRotation(&s[22], &s[23], true);
+ HadamardRotation(&s[24], &s[25], false);
+ HadamardRotation(&s[26], &s[27], true);
+ HadamardRotation(&s[28], &s[29], false);
+ HadamardRotation(&s[30], &s[31], true);
+
+ // stage 10.
+ butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+ butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+ butterfly_rotation(&s[26], &s[21], 24, true);
+ butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+ // stage 15.
+ HadamardRotation(&s[16], &s[19], false);
+ HadamardRotation(&s[17], &s[18], false);
+ HadamardRotation(&s[20], &s[23], true);
+ HadamardRotation(&s[21], &s[22], true);
+ HadamardRotation(&s[24], &s[27], false);
+ HadamardRotation(&s[25], &s[26], false);
+ HadamardRotation(&s[28], &s[31], true);
+ HadamardRotation(&s[29], &s[30], true);
+
+ // stage 20.
+ butterfly_rotation(&s[29], &s[18], 48, true);
+ butterfly_rotation(&s[28], &s[19], 48, true);
+ butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+ butterfly_rotation(&s[26], &s[21], 48 + 64, true);
+
+ // stage 24.
+ HadamardRotation(&s[16], &s[23], false);
+ HadamardRotation(&s[17], &s[22], false);
+ HadamardRotation(&s[18], &s[21], false);
+ HadamardRotation(&s[19], &s[20], false);
+ HadamardRotation(&s[24], &s[31], true);
+ HadamardRotation(&s[25], &s[30], true);
+ HadamardRotation(&s[26], &s[29], true);
+ HadamardRotation(&s[27], &s[28], true);
+
+ // stage 27.
+ butterfly_rotation(&s[27], &s[20], 32, true);
+ butterfly_rotation(&s[26], &s[21], 32, true);
+ butterfly_rotation(&s[25], &s[22], 32, true);
+ butterfly_rotation(&s[24], &s[23], 32, true);
+
+ // stage 29.
+ HadamardRotation(&s[0], &s[31], false);
+ HadamardRotation(&s[1], &s[30], false);
+ HadamardRotation(&s[2], &s[29], false);
+ HadamardRotation(&s[3], &s[28], false);
+ HadamardRotation(&s[4], &s[27], false);
+ HadamardRotation(&s[5], &s[26], false);
+ HadamardRotation(&s[6], &s[25], false);
+ HadamardRotation(&s[7], &s[24], false);
+ HadamardRotation(&s[8], &s[23], false);
+ HadamardRotation(&s[9], &s[22], false);
+ HadamardRotation(&s[10], &s[21], false);
+ HadamardRotation(&s[11], &s[20], false);
+ HadamardRotation(&s[12], &s[19], false);
+ HadamardRotation(&s[13], &s[18], false);
+ HadamardRotation(&s[14], &s[17], false);
+ HadamardRotation(&s[15], &s[16], false);
+}
+
+// Process dct32 rows or columns, depending on the transpose flag.
+LIBGAV1_ALWAYS_INLINE void Dct32_SSE4_1(void* dest, const int32_t step,
+ const bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[32], x[32];
+
+ if (transpose) {
+ for (int idx = 0; idx < 32; idx += 8) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, idx, input);
+ Transpose8x8_U16(input, &x[idx]);
+ }
+ } else {
+ LoadSrc<16, 32>(dst, step, 0, x);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+ s[0] = x[0];
+ s[1] = x[16];
+ s[2] = x[8];
+ s[3] = x[24];
+ s[4] = x[4];
+ s[5] = x[20];
+ s[6] = x[12];
+ s[7] = x[28];
+ s[8] = x[2];
+ s[9] = x[18];
+ s[10] = x[10];
+ s[11] = x[26];
+ s[12] = x[6];
+ s[13] = x[22];
+ s[14] = x[14];
+ s[15] = x[30];
+
+ // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+ s[16] = x[1];
+ s[17] = x[17];
+ s[18] = x[9];
+ s[19] = x[25];
+ s[20] = x[5];
+ s[21] = x[21];
+ s[22] = x[13];
+ s[23] = x[29];
+ s[24] = x[3];
+ s[25] = x[19];
+ s[26] = x[11];
+ s[27] = x[27];
+ s[28] = x[7];
+ s[29] = x[23];
+ s[30] = x[15];
+ s[31] = x[31];
+
+ Dct4Stages<ButterflyRotation_8>(s);
+ Dct8Stages<ButterflyRotation_8>(s);
+ Dct16Stages<ButterflyRotation_8>(s);
+ Dct32Stages<ButterflyRotation_8>(s);
+
+ if (transpose) {
+ for (int idx = 0; idx < 32; idx += 8) {
+ __m128i output[8];
+ Transpose8x8_U16(&s[idx], output);
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 32>(dst, step, 0, s);
+ }
+}
+
+// Allow the compiler to call this function instead of forcing it inline;
+// tests show this is slightly faster.
+void Dct64_SSE4_1(void* dest, int32_t step, bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[64], x[32];
+
+ if (transpose) {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ for (int idx = 0; idx < 32; idx += 8) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, idx, input);
+ Transpose8x8_U16(input, &x[idx]);
+ }
+ } else {
+ // The last 32 values of every column are always zero if the |tx_height| is
+ // 64.
+ LoadSrc<16, 32>(dst, step, 0, x);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+ s[0] = x[0];
+ s[2] = x[16];
+ s[4] = x[8];
+ s[6] = x[24];
+ s[8] = x[4];
+ s[10] = x[20];
+ s[12] = x[12];
+ s[14] = x[28];
+
+ // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+ s[16] = x[2];
+ s[18] = x[18];
+ s[20] = x[10];
+ s[22] = x[26];
+ s[24] = x[6];
+ s[26] = x[22];
+ s[28] = x[14];
+ s[30] = x[30];
+
+ // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+ s[32] = x[1];
+ s[34] = x[17];
+ s[36] = x[9];
+ s[38] = x[25];
+ s[40] = x[5];
+ s[42] = x[21];
+ s[44] = x[13];
+ s[46] = x[29];
+
+ // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
+ s[48] = x[3];
+ s[50] = x[19];
+ s[52] = x[11];
+ s[54] = x[27];
+ s[56] = x[7];
+ s[58] = x[23];
+ s[60] = x[15];
+ s[62] = x[31];
+
+ Dct4Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct8Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct16Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+ Dct32Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+
+ //-- start dct 64 stages
+ // stage 2.
+ ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
+ ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
+ ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
+ ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
+ ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
+ ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
+ ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
+ ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
+ ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
+ ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
+ ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
+ ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
+ ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
+ ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
+ ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
+ ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
+
+ // stage 4.
+ HadamardRotation(&s[32], &s[33], false);
+ HadamardRotation(&s[34], &s[35], true);
+ HadamardRotation(&s[36], &s[37], false);
+ HadamardRotation(&s[38], &s[39], true);
+ HadamardRotation(&s[40], &s[41], false);
+ HadamardRotation(&s[42], &s[43], true);
+ HadamardRotation(&s[44], &s[45], false);
+ HadamardRotation(&s[46], &s[47], true);
+ HadamardRotation(&s[48], &s[49], false);
+ HadamardRotation(&s[50], &s[51], true);
+ HadamardRotation(&s[52], &s[53], false);
+ HadamardRotation(&s[54], &s[55], true);
+ HadamardRotation(&s[56], &s[57], false);
+ HadamardRotation(&s[58], &s[59], true);
+ HadamardRotation(&s[60], &s[61], false);
+ HadamardRotation(&s[62], &s[63], true);
+
+ // stage 7.
+ ButterflyRotation_8(&s[62], &s[33], 60 - 0, true);
+ ButterflyRotation_8(&s[61], &s[34], 60 - 0 + 64, true);
+ ButterflyRotation_8(&s[58], &s[37], 60 - 32, true);
+ ButterflyRotation_8(&s[57], &s[38], 60 - 32 + 64, true);
+ ButterflyRotation_8(&s[54], &s[41], 60 - 16, true);
+ ButterflyRotation_8(&s[53], &s[42], 60 - 16 + 64, true);
+ ButterflyRotation_8(&s[50], &s[45], 60 - 48, true);
+ ButterflyRotation_8(&s[49], &s[46], 60 - 48 + 64, true);
+
+ // stage 11.
+ HadamardRotation(&s[32], &s[35], false);
+ HadamardRotation(&s[33], &s[34], false);
+ HadamardRotation(&s[36], &s[39], true);
+ HadamardRotation(&s[37], &s[38], true);
+ HadamardRotation(&s[40], &s[43], false);
+ HadamardRotation(&s[41], &s[42], false);
+ HadamardRotation(&s[44], &s[47], true);
+ HadamardRotation(&s[45], &s[46], true);
+ HadamardRotation(&s[48], &s[51], false);
+ HadamardRotation(&s[49], &s[50], false);
+ HadamardRotation(&s[52], &s[55], true);
+ HadamardRotation(&s[53], &s[54], true);
+ HadamardRotation(&s[56], &s[59], false);
+ HadamardRotation(&s[57], &s[58], false);
+ HadamardRotation(&s[60], &s[63], true);
+ HadamardRotation(&s[61], &s[62], true);
+
+ // stage 16.
+ ButterflyRotation_8(&s[61], &s[34], 56, true);
+ ButterflyRotation_8(&s[60], &s[35], 56, true);
+ ButterflyRotation_8(&s[59], &s[36], 56 + 64, true);
+ ButterflyRotation_8(&s[58], &s[37], 56 + 64, true);
+ ButterflyRotation_8(&s[53], &s[42], 56 - 32, true);
+ ButterflyRotation_8(&s[52], &s[43], 56 - 32, true);
+ ButterflyRotation_8(&s[51], &s[44], 56 - 32 + 64, true);
+ ButterflyRotation_8(&s[50], &s[45], 56 - 32 + 64, true);
+
+ // stage 21.
+ HadamardRotation(&s[32], &s[39], false);
+ HadamardRotation(&s[33], &s[38], false);
+ HadamardRotation(&s[34], &s[37], false);
+ HadamardRotation(&s[35], &s[36], false);
+ HadamardRotation(&s[40], &s[47], true);
+ HadamardRotation(&s[41], &s[46], true);
+ HadamardRotation(&s[42], &s[45], true);
+ HadamardRotation(&s[43], &s[44], true);
+ HadamardRotation(&s[48], &s[55], false);
+ HadamardRotation(&s[49], &s[54], false);
+ HadamardRotation(&s[50], &s[53], false);
+ HadamardRotation(&s[51], &s[52], false);
+ HadamardRotation(&s[56], &s[63], true);
+ HadamardRotation(&s[57], &s[62], true);
+ HadamardRotation(&s[58], &s[61], true);
+ HadamardRotation(&s[59], &s[60], true);
+
+ // stage 25.
+ ButterflyRotation_8(&s[59], &s[36], 48, true);
+ ButterflyRotation_8(&s[58], &s[37], 48, true);
+ ButterflyRotation_8(&s[57], &s[38], 48, true);
+ ButterflyRotation_8(&s[56], &s[39], 48, true);
+ ButterflyRotation_8(&s[55], &s[40], 112, true);
+ ButterflyRotation_8(&s[54], &s[41], 112, true);
+ ButterflyRotation_8(&s[53], &s[42], 112, true);
+ ButterflyRotation_8(&s[52], &s[43], 112, true);
+
+ // stage 28.
+ HadamardRotation(&s[32], &s[47], false);
+ HadamardRotation(&s[33], &s[46], false);
+ HadamardRotation(&s[34], &s[45], false);
+ HadamardRotation(&s[35], &s[44], false);
+ HadamardRotation(&s[36], &s[43], false);
+ HadamardRotation(&s[37], &s[42], false);
+ HadamardRotation(&s[38], &s[41], false);
+ HadamardRotation(&s[39], &s[40], false);
+ HadamardRotation(&s[48], &s[63], true);
+ HadamardRotation(&s[49], &s[62], true);
+ HadamardRotation(&s[50], &s[61], true);
+ HadamardRotation(&s[51], &s[60], true);
+ HadamardRotation(&s[52], &s[59], true);
+ HadamardRotation(&s[53], &s[58], true);
+ HadamardRotation(&s[54], &s[57], true);
+ HadamardRotation(&s[55], &s[56], true);
+
+ // stage 30.
+ ButterflyRotation_8(&s[55], &s[40], 32, true);
+ ButterflyRotation_8(&s[54], &s[41], 32, true);
+ ButterflyRotation_8(&s[53], &s[42], 32, true);
+ ButterflyRotation_8(&s[52], &s[43], 32, true);
+ ButterflyRotation_8(&s[51], &s[44], 32, true);
+ ButterflyRotation_8(&s[50], &s[45], 32, true);
+ ButterflyRotation_8(&s[49], &s[46], 32, true);
+ ButterflyRotation_8(&s[48], &s[47], 32, true);
+
+ // stage 31.
+ for (int i = 0; i < 32; i += 4) {
+ HadamardRotation(&s[i], &s[63 - i], false);
+ HadamardRotation(&s[i + 1], &s[63 - i - 1], false);
+ HadamardRotation(&s[i + 2], &s[63 - i - 2], false);
+ HadamardRotation(&s[i + 3], &s[63 - i - 3], false);
+ }
+ //-- end dct 64 stages
+
+ if (transpose) {
+ for (int idx = 0; idx < 64; idx += 8) {
+ __m128i output[8];
+ Transpose8x8_U16(&s[idx], output);
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 64>(dst, step, 0, s);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+
+template <bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst4_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[8], x[4];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[8];
+ LoadSrc<8, 8>(dst, step, 0, input);
+ Transpose4x8To8x4_U16(input, x);
+ } else {
+ LoadSrc<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ LoadSrc<8, 4>(dst, step, 0, x);
+ if (transpose) {
+ Transpose4x4_U16(x, x);
+ }
+ }
+
+ const __m128i kAdst4Multiplier_1 = _mm_set1_epi16(kAdst4Multiplier[1]);
+ const __m128i kAdst4Multiplier_2 = _mm_set1_epi16(kAdst4Multiplier[2]);
+ const __m128i kAdst4Multiplier_3 = _mm_set1_epi16(kAdst4Multiplier[3]);
+ const __m128i kAdst4Multiplier_m0_1 =
+ _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[1]) |
+ (static_cast<uint32_t>(-kAdst4Multiplier[0]) << 16));
+ const __m128i kAdst4Multiplier_3_0 =
+ _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[0]) |
+ (static_cast<uint32_t>(kAdst4Multiplier[3]) << 16));
+
+ // stage 1.
+ const __m128i x3_x0 = _mm_unpacklo_epi16(x[0], x[3]);
+ const __m128i x2_x0 = _mm_unpacklo_epi16(x[0], x[2]);
+ const __m128i zero_x1 = _mm_cvtepu16_epi32(x[1]);
+ const __m128i zero_x2 = _mm_cvtepu16_epi32(x[2]);
+ const __m128i zero_x3 = _mm_cvtepu16_epi32(x[3]);
+
+ s[5] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_1);
+ s[6] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_3);
+
+ // stage 2.
+ // ((src[0] - src[2]) + src[3]) * kAdst4Multiplier[2]
+ const __m128i k2_x3_x0 = _mm_madd_epi16(x3_x0, kAdst4Multiplier_2);
+ const __m128i k2_zero_x2 = _mm_madd_epi16(zero_x2, kAdst4Multiplier_2);
+ const __m128i b7 = _mm_sub_epi32(k2_x3_x0, k2_zero_x2);
+
+ // stage 3.
+ s[0] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_3_0);
+ s[1] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_m0_1);
+ s[2] = b7;
+ s[3] = _mm_madd_epi16(zero_x1, kAdst4Multiplier_2);
+
+ // stage 4.
+ s[0] = _mm_add_epi32(s[0], s[5]);
+ s[1] = _mm_sub_epi32(s[1], s[6]);
+
+ // stages 5 and 6.
+ x[0] = _mm_add_epi32(s[0], s[3]);
+ x[1] = _mm_add_epi32(s[1], s[3]);
+ x[2] = _mm_add_epi32(s[0], s[1]);
+ x[3] = _mm_sub_epi32(x[2], s[3]);
+
+ x[0] = RightShiftWithRounding_S32(x[0], 12);
+ x[1] = RightShiftWithRounding_S32(x[1], 12);
+ x[2] = RightShiftWithRounding_S32(s[2], 12);
+ x[3] = RightShiftWithRounding_S32(x[3], 12);
+
+ x[0] = _mm_packs_epi32(x[0], x[1]);
+ x[2] = _mm_packs_epi32(x[2], x[3]);
+ x[1] = _mm_srli_si128(x[0], 8);
+ x[3] = _mm_srli_si128(x[2], 8);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[8];
+ Transpose8x4To4x8_U16(x, output);
+ StoreDst<8, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 4>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ Transpose4x4_U16(x, x);
+ }
+ StoreDst<8, 4>(dst, step, 0, x);
+ }
+}
+
+constexpr int16_t kAdst4DcOnlyMultiplier[8] = {1321, 0, 2482, 0,
+ 3344, 0, 2482, 1321};
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src =
+ _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0), 0);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
+ const __m128i v_kAdst4DcOnlyMultipliers =
+ LoadUnaligned16(kAdst4DcOnlyMultiplier);
+ // s0*k0 s0*k1 s0*k2 s0*k1
+ // +
+ // s0*0 s0*0 s0*0 s0*k0
+ const __m128i x3 = _mm_madd_epi16(s0, v_kAdst4DcOnlyMultipliers);
+ const __m128i dst_0 = RightShiftWithRounding_S32(x3, 12);
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_add_epi32(dst_0, v_row_shift_add);
+ const __m128i b = _mm_sra_epi32(a, v_row_shift);
+ const __m128i c = _mm_packs_epi32(b, b);
+ StoreLo8(dst, c);
+
+ return true;
+}
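The kAdst4DcOnlyMultiplier table above is arranged so that one _mm_madd_epi16 against the splatted DC value yields all four ADST4 outputs at once; the fourth output's multiplier is carried as the pair (2482, 1321), whose products are summed by madd. A small standalone check of that layout (the DC value is made up):

#include <cstdint>
#include <cstdio>

int main() {
  const int16_t k[8] = {1321, 0, 2482, 0, 3344, 0, 2482, 1321};
  const int16_t s0 = 100;  // hypothetical DC coefficient
  for (int lane = 0; lane < 4; ++lane) {
    // _mm_madd_epi16 multiplies adjacent int16 pairs and adds the products.
    const int32_t out = s0 * k[2 * lane] + s0 * k[2 * lane + 1];
    std::printf("lane %d: %d\n", lane, out);  // 132100 248200 334400 380300
  }
  return 0;
}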
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int i = 0;
+ do {
+ const __m128i v_src = _mm_cvtepi16_epi32(LoadLo8(&dst[i]));
+ const __m128i kAdst4Multiplier_0 = _mm_set1_epi32(kAdst4Multiplier[0]);
+ const __m128i kAdst4Multiplier_1 = _mm_set1_epi32(kAdst4Multiplier[1]);
+ const __m128i kAdst4Multiplier_2 = _mm_set1_epi32(kAdst4Multiplier[2]);
+ const __m128i s0 = _mm_mullo_epi32(kAdst4Multiplier_0, v_src);
+ const __m128i s1 = _mm_mullo_epi32(kAdst4Multiplier_1, v_src);
+ const __m128i s2 = _mm_mullo_epi32(kAdst4Multiplier_2, v_src);
+ const __m128i x0 = s0;
+ const __m128i x1 = s1;
+ const __m128i x2 = s2;
+ const __m128i x3 = _mm_add_epi32(s0, s1);
+ const __m128i dst_0 = RightShiftWithRounding_S32(x0, 12);
+ const __m128i dst_1 = RightShiftWithRounding_S32(x1, 12);
+ const __m128i dst_2 = RightShiftWithRounding_S32(x2, 12);
+ const __m128i dst_3 = RightShiftWithRounding_S32(x3, 12);
+ const __m128i dst_0_1 = _mm_packs_epi32(dst_0, dst_1);
+ const __m128i dst_2_3 = _mm_packs_epi32(dst_2, dst_3);
+ StoreLo8(&dst[i], dst_0_1);
+ StoreHi8(&dst[i + width * 1], dst_0_1);
+ StoreLo8(&dst[i + width * 2], dst_2_3);
+ StoreHi8(&dst[i + width * 3], dst_2_3);
+ i += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst8_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[8], x[8];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8_U16(input, x);
+ } else {
+ LoadSrc<8, 8>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, 0, input);
+ Transpose8x8_U16(input, x);
+ } else {
+ LoadSrc<16, 8>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ s[0] = x[7];
+ s[1] = x[0];
+ s[2] = x[5];
+ s[3] = x[2];
+ s[4] = x[3];
+ s[5] = x[4];
+ s[6] = x[1];
+ s[7] = x[6];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+ butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+ butterfly_rotation(&s[6], &s[7], 60 - 48, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[4], false);
+ HadamardRotation(&s[1], &s[5], false);
+ HadamardRotation(&s[2], &s[6], false);
+ HadamardRotation(&s[3], &s[7], false);
+
+ // stage 4.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[2], false);
+ HadamardRotation(&s[4], &s[6], false);
+ HadamardRotation(&s[1], &s[3], false);
+ HadamardRotation(&s[5], &s[7], false);
+
+ // stage 6.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ const __m128i v_zero = _mm_setzero_si128();
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[4]);
+ x[2] = s[6];
+ x[3] = _mm_subs_epi16(v_zero, s[2]);
+ x[4] = s[3];
+ x[5] = _mm_subs_epi16(v_zero, s[7]);
+ x[6] = s[5];
+ x[7] = _mm_subs_epi16(v_zero, s[1]);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[4];
+ Transpose4x8To8x4_U16(x, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ } else {
+ StoreDst<8, 8>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ __m128i output[8];
+ Transpose8x8_U16(x, output);
+ StoreDst<16, 8>(dst, step, 0, output);
+ } else {
+ StoreDst<16, 8>(dst, step, 0, x);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ __m128i s[8];
+
+ const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ // stage 1.
+ s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask);
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ __m128i x[8];
+ const __m128i v_zero = _mm_setzero_si128();
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[4]);
+ x[2] = s[6];
+ x[3] = _mm_subs_epi16(v_zero, s[2]);
+ x[4] = s[3];
+ x[5] = _mm_subs_epi16(v_zero, s[7]);
+ x[6] = s[5];
+ x[7] = _mm_subs_epi16(v_zero, s[1]);
+
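+  // The unpacks below collect lane 0 of x[0]..x[7] into the low halves of
+  // x3_x0 and x7_x4 so the row shift and pack can emit all eight outputs
+  // contiguously.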
+ const __m128i x1_x0 = _mm_unpacklo_epi16(x[0], x[1]);
+ const __m128i x3_x2 = _mm_unpacklo_epi16(x[2], x[3]);
+ const __m128i x5_x4 = _mm_unpacklo_epi16(x[4], x[5]);
+ const __m128i x7_x6 = _mm_unpacklo_epi16(x[6], x[7]);
+ const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2);
+ const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6);
+
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add);
+ const __m128i a1 = _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add);
+ const __m128i b = _mm_sra_epi32(a, v_row_shift);
+ const __m128i b1 = _mm_sra_epi32(a1, v_row_shift);
+ StoreUnaligned16(dst, _mm_packs_epi32(b, b1));
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ __m128i s[8];
+
+ int i = 0;
+ do {
+ const __m128i v_src = LoadLo8(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ __m128i x[8];
+ const __m128i v_zero = _mm_setzero_si128();
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[4]);
+ x[2] = s[6];
+ x[3] = _mm_subs_epi16(v_zero, s[2]);
+ x[4] = s[3];
+ x[5] = _mm_subs_epi16(v_zero, s[7]);
+ x[6] = s[5];
+ x[7] = _mm_subs_epi16(v_zero, s[1]);
+
+ for (int j = 0; j < 8; ++j) {
+ StoreLo8(&dst[j * width], x[j]);
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst16_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
+ auto* const dst = static_cast<int16_t*>(dest);
+ __m128i s[16], x[16];
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i input[4];
+ LoadSrc<16, 4>(dst, step, 0, input);
+ Transpose8x4To4x8_U16(input, x);
+ LoadSrc<16, 4>(dst, step, 8, input);
+ Transpose8x4To4x8_U16(input, &x[8]);
+ } else {
+ LoadSrc<8, 16>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ __m128i input[8];
+ LoadSrc<16, 8>(dst, step, idx, input);
+ Transpose8x8_U16(input, &x[idx]);
+ }
+ } else {
+ LoadSrc<16, 16>(dst, step, 0, x);
+ }
+ }
+
+ // stage 1.
+ s[0] = x[15];
+ s[1] = x[0];
+ s[2] = x[13];
+ s[3] = x[2];
+ s[4] = x[11];
+ s[5] = x[4];
+ s[6] = x[9];
+ s[7] = x[6];
+ s[8] = x[7];
+ s[9] = x[8];
+ s[10] = x[5];
+ s[11] = x[10];
+ s[12] = x[3];
+ s[13] = x[12];
+ s[14] = x[1];
+ s[15] = x[14];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+ butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+ butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+ butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+ butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+ butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+ butterfly_rotation(&s[14], &s[15], 62 - 56, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[8], false);
+ HadamardRotation(&s[1], &s[9], false);
+ HadamardRotation(&s[2], &s[10], false);
+ HadamardRotation(&s[3], &s[11], false);
+ HadamardRotation(&s[4], &s[12], false);
+ HadamardRotation(&s[5], &s[13], false);
+ HadamardRotation(&s[6], &s[14], false);
+ HadamardRotation(&s[7], &s[15], false);
+
+ // stage 4.
+ butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+ butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+ butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 8 + 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[4], false);
+ HadamardRotation(&s[8], &s[12], false);
+ HadamardRotation(&s[1], &s[5], false);
+ HadamardRotation(&s[9], &s[13], false);
+ HadamardRotation(&s[2], &s[6], false);
+ HadamardRotation(&s[10], &s[14], false);
+ HadamardRotation(&s[3], &s[7], false);
+ HadamardRotation(&s[11], &s[15], false);
+
+ // stage 6.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 48 - 32, true);
+
+ // stage 7.
+ HadamardRotation(&s[0], &s[2], false);
+ HadamardRotation(&s[4], &s[6], false);
+ HadamardRotation(&s[8], &s[10], false);
+ HadamardRotation(&s[12], &s[14], false);
+ HadamardRotation(&s[1], &s[3], false);
+ HadamardRotation(&s[5], &s[7], false);
+ HadamardRotation(&s[9], &s[11], false);
+ HadamardRotation(&s[13], &s[15], false);
+
+ // stage 8.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+ butterfly_rotation(&s[10], &s[11], 32, true);
+ butterfly_rotation(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ const __m128i v_zero = _mm_setzero_si128();
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[8]);
+ x[2] = s[12];
+ x[3] = _mm_subs_epi16(v_zero, s[4]);
+ x[4] = s[6];
+ x[5] = _mm_subs_epi16(v_zero, s[14]);
+ x[6] = s[10];
+ x[7] = _mm_subs_epi16(v_zero, s[2]);
+ x[8] = s[3];
+ x[9] = _mm_subs_epi16(v_zero, s[11]);
+ x[10] = s[15];
+ x[11] = _mm_subs_epi16(v_zero, s[7]);
+ x[12] = s[5];
+ x[13] = _mm_subs_epi16(v_zero, s[13]);
+ x[14] = s[9];
+ x[15] = _mm_subs_epi16(v_zero, s[1]);
+
+ if (stage_is_rectangular) {
+ if (transpose) {
+ __m128i output[4];
+ Transpose4x8To8x4_U16(x, output);
+ StoreDst<16, 4>(dst, step, 0, output);
+ Transpose4x8To8x4_U16(&x[8], output);
+ StoreDst<16, 4>(dst, step, 8, output);
+ } else {
+ StoreDst<8, 16>(dst, step, 0, x);
+ }
+ } else {
+ if (transpose) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ __m128i output[8];
+ Transpose8x8_U16(&x[idx], output);
+ StoreDst<16, 8>(dst, step, idx, output);
+ }
+ } else {
+ StoreDst<16, 16>(dst, step, 0, x);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(__m128i* s, __m128i* x) {
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);
+
+ // stage 3.
+ s[8] = s[0];
+ s[9] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[8], &s[9], 56, true);
+
+ // stage 5.
+ s[4] = s[0];
+ s[12] = s[8];
+ s[5] = s[1];
+ s[13] = s[9];
+
+ // stage 6.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+ ButterflyRotation_4(&s[12], &s[13], 48, true);
+
+ // stage 7.
+ s[2] = s[0];
+ s[6] = s[4];
+ s[10] = s[8];
+ s[14] = s[12];
+ s[3] = s[1];
+ s[7] = s[5];
+ s[11] = s[9];
+ s[15] = s[13];
+
+ // stage 8.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+ ButterflyRotation_4(&s[10], &s[11], 32, true);
+ ButterflyRotation_4(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ const __m128i v_zero = _mm_setzero_si128();
+ x[0] = s[0];
+ x[1] = _mm_subs_epi16(v_zero, s[8]);
+ x[2] = s[12];
+ x[3] = _mm_subs_epi16(v_zero, s[4]);
+ x[4] = s[6];
+ x[5] = _mm_subs_epi16(v_zero, s[14]);
+ x[6] = s[10];
+ x[7] = _mm_subs_epi16(v_zero, s[2]);
+ x[8] = s[3];
+ x[9] = _mm_subs_epi16(v_zero, s[11]);
+ x[10] = s[15];
+ x[11] = _mm_subs_epi16(v_zero, s[7]);
+ x[12] = s[5];
+ x[13] = _mm_subs_epi16(v_zero, s[13]);
+ x[14] = s[9];
+ x[15] = _mm_subs_epi16(v_zero, s[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ __m128i s[16];
+ __m128i x[16];
+
+ const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ // stage 1.
+ s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask);
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int i = 0; i < 2; ++i) {
+ const __m128i x1_x0 = _mm_unpacklo_epi16(x[0 + i * 8], x[1 + i * 8]);
+ const __m128i x3_x2 = _mm_unpacklo_epi16(x[2 + i * 8], x[3 + i * 8]);
+ const __m128i x5_x4 = _mm_unpacklo_epi16(x[4 + i * 8], x[5 + i * 8]);
+ const __m128i x7_x6 = _mm_unpacklo_epi16(x[6 + i * 8], x[7 + i * 8]);
+ const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2);
+ const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6);
+
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add);
+ const __m128i a1 =
+ _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add);
+ const __m128i b = _mm_sra_epi32(a, v_row_shift);
+ const __m128i b1 = _mm_sra_epi32(a1, v_row_shift);
+ StoreUnaligned16(&dst[i * 8], _mm_packs_epi32(b, b1));
+ }
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+ int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ int i = 0;
+ do {
+ __m128i s[16];
+ __m128i x[16];
+ const __m128i v_src = LoadUnaligned16(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int j = 0; j < 16; ++j) {
+ StoreLo8(&dst[j * width], x[j]);
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+
+template <bool is_row_shift>
+LIBGAV1_ALWAYS_INLINE void Identity4_SSE4_1(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ if (is_row_shift) {
+ const int shift = 1;
+ const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+ const __m128i v_multiplier_one =
+ _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
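+    // Pairing each source value with v_dual_round and multiplying by the
+    // (kIdentity4Multiplier, 1) word pair lets _mm_madd_epi16 compute
+    // src * kIdentity4Multiplier + v_dual_round per 32-bit lane, so only the
+    // arithmetic shift by 12 + shift remains.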
+ for (int i = 0; i < 4; i += 2) {
+ const __m128i v_src = LoadUnaligned16(&dst[i * step]);
+ const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
+ const __m128i v_src_round_hi = _mm_unpackhi_epi16(v_dual_round, v_src);
+ const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
+ const __m128i a_hi = _mm_madd_epi16(v_src_round_hi, v_multiplier_one);
+ const __m128i b = _mm_srai_epi32(a, 12 + shift);
+ const __m128i b_hi = _mm_srai_epi32(a_hi, 12 + shift);
+ StoreUnaligned16(&dst[i * step], _mm_packs_epi32(b, b_hi));
+ }
+ } else {
+ const __m128i v_multiplier =
+ _mm_set1_epi16(kIdentity4MultiplierFraction << 3);
+ for (int i = 0; i < 4; i += 2) {
+ const __m128i v_src = LoadUnaligned16(&dst[i * step]);
+ const __m128i a = _mm_mulhrs_epi16(v_src, v_multiplier);
+ const __m128i b = _mm_adds_epi16(a, v_src);
+ StoreUnaligned16(&dst[i * step], b);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+ const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round, v_mask);
+
+ const int shift = (tx_height < 16) ? 0 : 1;
+ const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+ const __m128i v_multiplier_one =
+ _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
+ const __m128i v_src_round_lo = _mm_unpacklo_epi16(v_dual_round, v_src);
+ const __m128i a = _mm_madd_epi16(v_src_round_lo, v_multiplier_one);
+ const __m128i b = _mm_srai_epi32(a, 12 + shift);
+ dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4ColumnStoreToFrame(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+
+ const __m128i v_multiplier_fraction =
+ _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
+ const __m128i v_eight = _mm_set1_epi16(8);
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const __m128i v_src = LoadLo8(&source[i * tx_width]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
+ const __m128i frame_data = Load4(dst);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ Store4(dst, _mm_packus_epi16(d, d));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&source[row + j]);
+ const __m128i v_src_mult =
+ _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ StoreLo8(dst + j, _mm_packus_epi16(d, d));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+
+ const __m128i v_multiplier_fraction =
+ _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
+ const __m128i v_eight = _mm_set1_epi16(8);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const __m128i v_src = LoadLo8(&source[i * tx_width]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
+ const __m128i frame_data = Load4(dst);
+ const __m128i v_dst_row = _mm_adds_epi16(v_src_mult, v_src);
+ const __m128i v_src_mult2 =
+ _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
+ const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
+ const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
+ const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_adds_epi16(frame_data16, b);
+ Store4(dst, _mm_packus_epi16(c, c));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&source[row + j]);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+ const __m128i v_dst_row = _mm_adds_epi16(v_src_round, v_src_round);
+ const __m128i v_src_mult2 =
+ _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
+ const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
+ const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_adds_epi16(frame_data16, b);
+ StoreLo8(dst + j, _mm_packus_epi16(c, c));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_SSE4_1(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height equal to 32 can be simplified from
+  // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
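+  // _mm_mulhrs_epi16 with a multiplier of 1 << 14 computes
+  // (A * (1 << 14) + (1 << 14)) >> 15, which is exactly (A + 1) >> 1.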
+ const __m128i v_row_multiplier = _mm_set1_epi16(1 << 14);
+ for (int h = 0; h < 4; ++h) {
+ const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_row_multiplier);
+ StoreUnaligned16(&dst[h * step], v_src_mult);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_SSE4_1(void* dest, int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ for (int h = 0; h < 4; ++h) {
+ const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+    // For bitdepth == 8, the identity row clamps to a signed 16-bit value, so
+ // saturating add here is ok.
+ const __m128i a = _mm_adds_epi16(v_src, v_src);
+ StoreUnaligned16(&dst[h * step], a);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round =
+ _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+ const __m128i v_src =
+ _mm_cvtepi16_epi32(_mm_blendv_epi8(v_src0, v_src_round, v_mask));
+ const __m128i v_srcx2 = _mm_add_epi32(v_src, v_src);
+ const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+ const __m128i a = _mm_add_epi32(v_srcx2, v_row_shift_add);
+ const __m128i b = _mm_sra_epi32(a, v_row_shift);
+ dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8ColumnStoreToFrame_SSE4_1(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+ const __m128i v_eight = _mm_set1_epi16(8);
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ const __m128i v_src = LoadLo8(&source[row]);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
+ const __m128i frame_data = Load4(dst);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ Store4(dst, _mm_packus_epi16(d, d));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&source[row + j]);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ StoreLo8(dst + j, _mm_packus_epi16(d, d));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16Row_SSE4_1(void* dest, int32_t step,
+ int shift) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+ const __m128i v_multiplier_one =
+ _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
+ const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
+
+ for (int h = 0; h < 4; ++h) {
+ const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+ const __m128i v_src2 = LoadUnaligned16(&dst[h * step + 8]);
+ const __m128i v_src_round0 = _mm_unpacklo_epi16(v_dual_round, v_src);
+ const __m128i v_src_round1 = _mm_unpackhi_epi16(v_dual_round, v_src);
+ const __m128i v_src2_round0 = _mm_unpacklo_epi16(v_dual_round, v_src2);
+ const __m128i v_src2_round1 = _mm_unpackhi_epi16(v_dual_round, v_src2);
+ const __m128i madd0 = _mm_madd_epi16(v_src_round0, v_multiplier_one);
+ const __m128i madd1 = _mm_madd_epi16(v_src_round1, v_multiplier_one);
+ const __m128i madd20 = _mm_madd_epi16(v_src2_round0, v_multiplier_one);
+ const __m128i madd21 = _mm_madd_epi16(v_src2_round1, v_multiplier_one);
+ const __m128i shift0 = _mm_sra_epi32(madd0, v_shift);
+ const __m128i shift1 = _mm_sra_epi32(madd1, v_shift);
+ const __m128i shift20 = _mm_sra_epi32(madd20, v_shift);
+ const __m128i shift21 = _mm_sra_epi32(madd21, v_shift);
+ StoreUnaligned16(&dst[h * step], _mm_packs_epi32(shift0, shift1));
+ StoreUnaligned16(&dst[h * step + 8], _mm_packs_epi32(shift20, shift21));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src_round0 =
+ _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+ const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round0, v_mask);
+ const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+ const __m128i v_multiplier_one =
+ _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
+ const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
+ const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
+ const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
+ const __m128i b = _mm_sra_epi32(a, v_shift);
+ dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16ColumnStoreToFrame_SSE4_1(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+ const __m128i v_eight = _mm_set1_epi16(8);
+ const __m128i v_multiplier =
+ _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 4));
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const __m128i v_src = LoadLo8(&source[i * tx_width]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
+ const __m128i frame_data = Load4(dst);
+ const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ Store4(dst, _mm_packus_epi16(d, d));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_src = LoadUnaligned16(&source[row + j]);
+ const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
+ const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ StoreLo8(dst + j, _mm_packus_epi16(d, d));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity32Row16_SSE4_1(void* dest,
+ const int32_t step) {
+ auto* const dst = static_cast<int16_t*>(dest);
+
+ // When combining the identity32 multiplier with the row shift, the
+ // calculation for tx_height equal to 16 can be simplified from
+  // (((A * 4) + 1) >> 1) to (A * 2).
+ for (int h = 0; h < 4; ++h) {
+ for (int i = 0; i < 32; i += 8) {
+ const __m128i v_src = LoadUnaligned16(&dst[h * step + i]);
+      // For bitdepth == 8, the identity row clamps to a signed 16-bit value,
+      // so saturating add here is ok.
+ const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
+ StoreUnaligned16(&dst[h * step + i], v_dst_i);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
+ int adjusted_tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int16_t*>(dest);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
+ const __m128i v_src = _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+
+ // When combining the identity32 multiplier with the row shift, the
+ // calculation for tx_height equal to 16 can be simplified from
+  // (((A * 4) + 1) >> 1) to (A * 2).
+ const __m128i v_dst_0 = _mm_adds_epi16(v_src, v_src);
+ dst[0] = _mm_extract_epi16(v_dst_0, 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity32ColumnStoreToFrame(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source) {
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+ const __m128i v_two = _mm_set1_epi16(2);
+
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const __m128i v_dst_i = LoadUnaligned16(&source[row + j]);
+ const __m128i frame_data = LoadLo8(dst + j);
+ const __m128i a = _mm_adds_epi16(v_dst_i, v_two);
+ const __m128i b = _mm_srai_epi16(a, 2);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ StoreLo8(dst + j, _mm_packus_epi16(d, d));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+}
+
+//------------------------------------------------------------------------------
+// Walsh Hadamard Transform.
+
+// Process 4 wht4 rows and columns.
+LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView<uint8_t> frame,
+ const int start_x, const int start_y,
+ const void* source,
+ const int adjusted_tx_height) {
+ const auto* const src = static_cast<const int16_t*>(source);
+ __m128i s[4], x[4];
+
+ if (adjusted_tx_height == 1) {
+ // Special case: only src[0] is nonzero.
+ // src[0] 0 0 0
+ // 0 0 0 0
+ // 0 0 0 0
+ // 0 0 0 0
+ //
+ // After the row and column transforms are applied, we have:
+ // f h h h
+ // g i i i
+ // g i i i
+ // g i i i
+ // where f, g, h, i are computed as follows.
+ int16_t f = (src[0] >> 2) - (src[0] >> 3);
+ const int16_t g = f >> 1;
+ f = f - (f >> 1);
+ const int16_t h = (src[0] >> 3) - (src[0] >> 4);
+ const int16_t i = (src[0] >> 4);
+ s[0] = _mm_set1_epi16(h);
+ s[0] = _mm_insert_epi16(s[0], f, 0);
+ s[1] = _mm_set1_epi16(i);
+ s[1] = _mm_insert_epi16(s[1], g, 0);
+ s[2] = s[3] = s[1];
+ } else {
+ x[0] = LoadLo8(&src[0 * 4]);
+ x[2] = LoadLo8(&src[1 * 4]);
+ x[3] = LoadLo8(&src[2 * 4]);
+ x[1] = LoadLo8(&src[3 * 4]);
+
+ // Row transforms.
+ Transpose4x4_U16(x, x);
+ s[0] = _mm_srai_epi16(x[0], 2);
+ s[2] = _mm_srai_epi16(x[1], 2);
+ s[3] = _mm_srai_epi16(x[2], 2);
+ s[1] = _mm_srai_epi16(x[3], 2);
+ s[0] = _mm_add_epi16(s[0], s[2]);
+ s[3] = _mm_sub_epi16(s[3], s[1]);
+ __m128i e = _mm_sub_epi16(s[0], s[3]);
+ e = _mm_srai_epi16(e, 1);
+ s[1] = _mm_sub_epi16(e, s[1]);
+ s[2] = _mm_sub_epi16(e, s[2]);
+ s[0] = _mm_sub_epi16(s[0], s[1]);
+ s[3] = _mm_add_epi16(s[3], s[2]);
+ Transpose4x4_U16(s, s);
+
+ // Column transforms.
+ s[0] = _mm_add_epi16(s[0], s[2]);
+ s[3] = _mm_sub_epi16(s[3], s[1]);
+ e = _mm_sub_epi16(s[0], s[3]);
+ e = _mm_srai_epi16(e, 1);
+ s[1] = _mm_sub_epi16(e, s[1]);
+ s[2] = _mm_sub_epi16(e, s[2]);
+ s[0] = _mm_sub_epi16(s[0], s[1]);
+ s[3] = _mm_add_epi16(s[3], s[2]);
+ }
+
+ // Store to frame.
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+ for (int row = 0; row < 4; ++row) {
+ const __m128i frame_data = Load4(dst);
+ const __m128i a = _mm_cvtepu8_epi16(frame_data);
+ // Saturate to prevent overflowing int16_t
+ const __m128i b = _mm_adds_epi16(a, s[row]);
+ Store4(dst, _mm_packus_epi16(b, b));
+ dst += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
+template <bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+ Array2DView<uint8_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int16_t* source,
+ TransformType tx_type) {
+ const bool flip_rows =
+ enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+ const __m128i v_eight = _mm_set1_epi16(8);
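+  // Each residual is rounded with (x + 8) >> 4, added to the frame pixels,
+  // and clamped back to 8 bits by _mm_packus_epi16.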
+ const int stride = frame.columns();
+ uint8_t* dst = frame[start_y] + start_x;
+ if (tx_width == 4) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+ const __m128i residual = LoadLo8(&source[row]);
+ const __m128i frame_data = Load4(dst);
+ // Saturate to prevent overflowing int16_t
+ const __m128i a = _mm_adds_epi16(residual, v_eight);
+ const __m128i b = _mm_srai_epi16(a, 4);
+ const __m128i c = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d = _mm_adds_epi16(c, b);
+ Store4(dst, _mm_packus_epi16(d, d));
+ dst += stride;
+ }
+ } else if (tx_width == 8) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 8 : i * 8;
+ const __m128i residual = LoadUnaligned16(&source[row]);
+ const __m128i frame_data = LoadLo8(dst);
+ // Saturate to prevent overflowing int16_t
+ const __m128i b = _mm_adds_epi16(residual, v_eight);
+ const __m128i c = _mm_srai_epi16(b, 4);
+ const __m128i d = _mm_cvtepu8_epi16(frame_data);
+ const __m128i e = _mm_adds_epi16(d, c);
+ StoreLo8(dst, _mm_packus_epi16(e, e));
+ dst += stride;
+ }
+ } else {
+ for (int i = 0; i < tx_height; ++i) {
+ const int y = start_y + i;
+ const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+ int j = 0;
+ do {
+ const int x = start_x + j;
+ const __m128i residual = LoadUnaligned16(&source[row + j]);
+ const __m128i residual_hi = LoadUnaligned16(&source[row + j + 8]);
+ const __m128i frame_data = LoadUnaligned16(frame[y] + x);
+ const __m128i b = _mm_adds_epi16(residual, v_eight);
+ const __m128i b_hi = _mm_adds_epi16(residual_hi, v_eight);
+ const __m128i c = _mm_srai_epi16(b, 4);
+ const __m128i c_hi = _mm_srai_epi16(b_hi, 4);
+ const __m128i d = _mm_cvtepu8_epi16(frame_data);
+ const __m128i d_hi = _mm_cvtepu8_epi16(_mm_srli_si128(frame_data, 8));
+ const __m128i e = _mm_adds_epi16(d, c);
+ const __m128i e_hi = _mm_adds_epi16(d_hi, c_hi);
+ StoreUnaligned16(frame[y] + x, _mm_packus_epi16(e, e_hi));
+ j += 16;
+ } while (j < tx_width);
+ }
+ }
+}
+
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int16_t* source, int tx_width) {
+ const __m128i word_reverse_8 =
+ _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
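+  // word_reverse_8 reverses the eight 16-bit lanes of a register. For widths
+  // of 16 or more the two reversed halves are also swapped when stored, so a
+  // full row of 16 values ends up reversed.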
+ if (tx_width >= 16) {
+ int i = 0;
+ do {
+ // read 16 shorts
+ const __m128i v3210 = LoadUnaligned16(&source[i]);
+ const __m128i v7654 = LoadUnaligned16(&source[i + 8]);
+ const __m128i v0123 = _mm_shuffle_epi8(v3210, word_reverse_8);
+ const __m128i v4567 = _mm_shuffle_epi8(v7654, word_reverse_8);
+ StoreUnaligned16(&source[i], v4567);
+ StoreUnaligned16(&source[i + 8], v0123);
+ i += 16;
+ } while (i < tx_width * tx_height);
+ } else if (tx_width == 8) {
+ for (int i = 0; i < 8 * tx_height; i += 8) {
+ const __m128i a = LoadUnaligned16(&source[i]);
+ const __m128i b = _mm_shuffle_epi8(a, word_reverse_8);
+ StoreUnaligned16(&source[i], b);
+ }
+ } else {
+ const __m128i dual_word_reverse_4 =
+ _mm_set_epi32(0x09080b0a, 0x0d0c0f0e, 0x01000302, 0x05040706);
+ // Process two rows per iteration.
+ for (int i = 0; i < 4 * tx_height; i += 8) {
+ const __m128i a = LoadUnaligned16(&source[i]);
+ const __m128i b = _mm_shuffle_epi8(a, dual_word_reverse_4);
+ StoreUnaligned16(&source[i], b);
+ }
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int16_t* source, int num_rows) {
+ const __m128i v_kTransformRowMultiplier =
+ _mm_set1_epi16(kTransformRowMultiplier << 3);
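+  // kTransformRowMultiplier is a 12-bit fixed-point factor. Shifting it left
+  // by 3 lets _mm_mulhrs_epi16, which computes (a * b + (1 << 14)) >> 15,
+  // return the rounded product (a * kTransformRowMultiplier + (1 << 11)) >> 12.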
+ if (tx_width == 4) {
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const __m128i a = LoadUnaligned16(&source[i]);
+ const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
+ StoreUnaligned16(&source[i], b);
+ i += 8;
+ } while (i < tx_width * num_rows);
+ } else {
+ int i = 0;
+ do {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ const int non_zero_width = (tx_width < 64) ? tx_width : 32;
+ int j = 0;
+ do {
+ const __m128i a = LoadUnaligned16(&source[i * tx_width + j]);
+ const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
+ StoreUnaligned16(&source[i * tx_width + j], b);
+ j += 8;
+ } while (j < non_zero_width);
+ } while (++i < num_rows);
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int16_t* source, int num_rows,
+ int row_shift) {
+ const __m128i v_row_shift_add = _mm_set1_epi16(row_shift);
+ const __m128i v_row_shift = _mm_cvtepu16_epi64(v_row_shift_add);
+ if (tx_width == 4) {
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const __m128i residual = LoadUnaligned16(&source[i]);
+ const __m128i shifted_residual =
+ ShiftResidual(residual, v_row_shift_add, v_row_shift);
+ StoreUnaligned16(&source[i], shifted_residual);
+ i += 8;
+ } while (i < tx_width * num_rows);
+ } else {
+ int i = 0;
+ do {
+ for (int j = 0; j < tx_width; j += 8) {
+ const __m128i residual = LoadUnaligned16(&source[i * tx_width + j]);
+ const __m128i shifted_residual =
+ ShiftResidual(residual, v_row_shift_add, v_row_shift);
+ StoreUnaligned16(&source[i * tx_width + j], shifted_residual);
+ }
+ } while (++i < num_rows);
+ }
+}
+
+void Dct4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ const int row_shift = static_cast<int>(tx_height == 16);
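+  // Transforms of width 4 only have heights 4, 8 and 16, so 4x8 is the only
+  // size whose rows are rounded and 4x16 the only one that needs a row shift.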
+
+ if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d dct4 rows in parallel.
+ Dct4_SSE4_1<ButterflyRotation_4, false>(src, /*step=*/4,
+ /*transpose=*/true);
+ } else {
+ // Process 8 1d dct4 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i * 4], /*step=*/4,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ if (tx_height == 16) {
+ RowShift<4>(src, adjusted_tx_height, 1);
+ }
+}
+
+void Dct4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct4 columns in parallel.
+ Dct4_SSE4_1<ButterflyRotation_4, false>(src, tx_width,
+ /*transpose=*/false);
+ } else {
+ // Process 8 1d dct4 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 4, src, tx_type);
+}
+
+void Dct8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d dct8 rows in parallel.
+ Dct8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+ } else {
+ // Process 8 1d dct8 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ if (row_shift > 0) {
+ RowShift<8>(src, adjusted_tx_height, row_shift);
+ }
+}
+
+void Dct8TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct8 columns in parallel.
+ Dct8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ // Process 8 1d dct8 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 8, src, tx_type);
+}
+
+void Dct16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d dct16 rows in parallel.
+ Dct16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
+ } else {
+ int i = 0;
+ do {
+ // Process 8 1d dct16 rows in parallel per iteration.
+ Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+  // row_shift is always non-zero here.
+ RowShift<16>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct16TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d dct16 columns in parallel.
+ Dct16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ int i = 0;
+ do {
+ // Process 8 1d dct16 columns in parallel per iteration.
+ Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 16, src, tx_type);
+}
+
+void Dct32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<32>(src, adjusted_tx_height);
+ }
+ // Process 8 1d dct32 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct32_SSE4_1(&src[i * 32], 32, /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+  // row_shift is always non-zero here.
+ RowShift<32>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct32TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+ // Process 8 1d dct32 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Dct32_SSE4_1(&src[i], tx_width, /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 32, src, tx_type);
+}
+
+void Dct64TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<64>(src, adjusted_tx_height);
+ }
+ // Process 8 1d dct64 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct64_SSE4_1(&src[i * 64], 64, /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+  // row_shift is always non-zero here.
+ RowShift<64>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct64TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+ // Process 8 1d dct64 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Dct64_SSE4_1(&src[i], tx_width, /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound(frame, start_x, start_y, tx_width, 64, src, tx_type);
+}
+
+void Adst4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const int row_shift = static_cast<int>(tx_height == 16);
+ const bool should_round = (tx_height == 8);
+
+ if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst4 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Adst4_SSE4_1<false>(&src[i * 4], /*step=*/4, /*transpose=*/true);
+ i += 4;
+ } while (i < adjusted_tx_height);
+
+ if (row_shift != 0) {
+ RowShift<4>(src, adjusted_tx_height, 1);
+ }
+}
+
+void Adst4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d adst4 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Adst4_SSE4_1<false>(&src[i], tx_width, /*transpose=*/false);
+ i += 4;
+ } while (i < tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, 4, src, tx_type);
+}
+
+void Adst8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d adst8 rows in parallel.
+ Adst8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8,
+ /*transpose=*/true);
+ } else {
+ // Process 8 1d adst8 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ if (row_shift > 0) {
+ RowShift<8>(src, adjusted_tx_height, row_shift);
+ }
+}
+
+void Adst8TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d adst8 columns in parallel.
+ Adst8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ // Process 8 1d adst8 columns in parallel per iteration.
+ int i = 0;
+ do {
+ Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, 8, src, tx_type);
+}
+
+void Adst16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d adst16 rows in parallel.
+ Adst16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
+ } else {
+ int i = 0;
+ do {
+ // Process 8 1d adst16 rows in parallel per iteration.
+ Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+  // row_shift is always non-zero here.
+ RowShift<16>(src, adjusted_tx_height, row_shift);
+}
+
+void Adst16TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ if (tx_width == 4) {
+ // Process 4 1d adst16 columns in parallel.
+ Adst16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+ } else {
+ int i = 0;
+ do {
+ // Process 8 1d adst16 columns in parallel per iteration.
+ Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+ /*transpose=*/false);
+ i += 8;
+ } while (i < tx_width);
+ }
+ }
+ StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, 16, src, tx_type);
+}
+
+void Identity4TransformLoopRow_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize4x4) {
+ return;
+ }
+
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+ if (tx_height < 16) {
+ int i = 0;
+ do {
+ Identity4_SSE4_1<false>(&src[i * 4], /*step=*/4);
+ i += 4;
+ } while (i < adjusted_tx_height);
+ } else {
+ int i = 0;
+ do {
+ Identity4_SSE4_1<true>(&src[i * 4], /*step=*/4);
+ i += 4;
+ } while (i < adjusted_tx_height);
+ }
+}
+
+void Identity4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ // Special case: Process row calculations during column transform call.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+ Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+ return;
+ }
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ Identity4ColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity8TransformLoopRow_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize8x4) {
+ return;
+ }
+
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+ if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 16 can be simplified
+  // from (((A * 2) + 1) >> 1) to A.
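+  // (tx_height & 0x18) selects exactly the heights 8 and 16, so nothing
+  // further needs to be done to the rows for those sizes.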
+ if ((tx_height & 0x18) != 0) {
+ return;
+ }
+ if (tx_height == 32) {
+ int i = 0;
+ do {
+ Identity8Row32_SSE4_1(&src[i * 8], /*step=*/8);
+ i += 4;
+ } while (i < adjusted_tx_height);
+ return;
+ }
+
+ assert(tx_size == kTransformSize8x4);
+ int i = 0;
+ do {
+ Identity8Row4_SSE4_1(&src[i * 8], /*step=*/8);
+ i += 4;
+ } while (i < adjusted_tx_height);
+}
+
+void Identity8TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ Identity8ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+ if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+ int i = 0;
+ do {
+ Identity16Row_SSE4_1(&src[i * 16], /*step=*/16,
+ kTransformRowShift[tx_size]);
+ i += 4;
+ } while (i < adjusted_tx_height);
+}
+
+void Identity16TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ Identity16ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ const int tx_height = kTransformHeight[tx_size];
+ // When combining the identity32 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 32 can be simplified
+  // from (((A * 4) + 2) >> 2) to A.
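+  // (tx_height & 0x28) selects the heights 8 and 32, for which the whole row
+  // pass is therefore a no-op.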
+ if ((tx_height & 0x28) != 0) {
+ return;
+ }
+
+ // Process kTransformSize32x16. The src is always rounded before the
+ // identity transform and shifted by 1 afterwards.
+ auto* src = static_cast<int16_t*>(src_buffer);
+ if (Identity32DcOnly(src, adjusted_tx_height)) {
+ return;
+ }
+
+ assert(tx_size == kTransformSize32x16);
+ ApplyRounding<32>(src, adjusted_tx_height);
+ int i = 0;
+ do {
+ Identity32Row16_SSE4_1(&src[i * 32], /*step=*/32);
+ i += 4;
+ } while (i < adjusted_tx_height);
+}
+
+void Identity32TransformLoopColumn_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ Identity32ColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_SSE4_1(TransformType tx_type, TransformSize tx_size,
+ int /*adjusted_tx_height*/,
+ void* /*src_buffer*/, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+ // Do both row and column transforms in the column-transform pass.
+}
+
+void Wht4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+
+ // Do both row and column transforms in the column-transform pass.
+ // Process 4 1d wht4 rows and columns in parallel.
+ const auto* src = static_cast<int16_t*>(src_buffer);
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ Wht4_SSE4_1(frame, start_x, start_y, src, adjusted_tx_height);
+}
+
+//------------------------------------------------------------------------------
+
+template <typename Residual, typename Pixel>
+void InitAll(Dsp* const dsp) {
+ // Maximum transform size for Dct is 64.
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ Dct4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ Dct4TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ Dct8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ Dct8TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ Dct16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ Dct16TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ Dct32TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ Dct32TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ Dct64TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ Dct64TransformLoopColumn_SSE4_1;
+
+ // Maximum transform size for Adst is 16.
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ Adst4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ Adst4TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ Adst8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ Adst8TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ Adst16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ Adst16TransformLoopColumn_SSE4_1;
+
+ // Maximum transform size for Identity transform is 32.
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ Identity4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ Identity4TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ Identity8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ Identity8TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ Identity16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ Identity16TransformLoopColumn_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+ Identity32TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ Identity32TransformLoopColumn_SSE4_1;
+
+ // Maximum transform size for Wht is 4.
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+ Wht4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ Wht4TransformLoopColumn_SSE4_1;
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ InitAll<int16_t, uint8_t>(dsp);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformDct)
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ Dct4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ Dct4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformDct)
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ Dct8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ Dct8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformDct)
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ Dct16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ Dct16TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformDct)
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ Dct32TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ Dct32TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize64_1DTransformDct)
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ Dct64TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ Dct64TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformAdst)
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ Adst4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ Adst4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformAdst)
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ Adst8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ Adst8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformAdst)
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ Adst16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ Adst16TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformIdentity)
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ Identity4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ Identity4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformIdentity)
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ Identity8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ Identity8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformIdentity)
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ Identity16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ Identity16TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformIdentity)
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+ Identity32TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ Identity32TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformWht)
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+ Wht4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ Wht4TransformLoopColumn_SSE4_1;
+#endif
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void InverseTransformInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void InverseTransformInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/inverse_transform_sse4.h b/src/dsp/x86/inverse_transform_sse4.h
new file mode 100644
index 0000000..106084b
--- /dev/null
+++ b/src/dsp/x86/inverse_transform_sse4.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::inverse_transforms, see the defines below for specifics.
+// This function is not thread-safe.
+void InverseTransformInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct
+#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct
+#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct
+#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct
+#define LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst
+#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst
+#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity
+#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity
+#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity
+#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht
+#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+#endif // LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
diff --git a/src/dsp/x86/loop_filter_sse4.cc b/src/dsp/x86/loop_filter_sse4.cc
new file mode 100644
index 0000000..d67b450
--- /dev/null
+++ b/src/dsp/x86/loop_filter_sse4.cc
@@ -0,0 +1,2256 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
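+// Returns total + a1 + a2 - s1 - s2. The wider filters below use this to
+// slide their tap window: add the two samples entering the sum and subtract
+// the two leaving it.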
+inline __m128i FilterAdd2Sub2(const __m128i& total, const __m128i& a1,
+ const __m128i& a2, const __m128i& s1,
+ const __m128i& s2) {
+ __m128i x = _mm_add_epi16(a1, total);
+ x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(s1, s2)), a2);
+ return x;
+}
+
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
+inline __m128i AbsDiff(const __m128i& a, const __m128i& b) {
+ return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
+
+inline __m128i CheckOuterThreshF4(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& outer_thresh) {
+ const __m128i fe = _mm_set1_epi8(static_cast<int8_t>(0xfe));
+ // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh;
+ const __m128i abs_pmq = AbsDiff(p1p0, q1q0);
+ const __m128i a = _mm_adds_epu8(abs_pmq, abs_pmq);
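+ // There is no 8-bit shift instruction; clearing bit 0 of every byte before
+ // the 16-bit shift keeps the neighboring byte's low bit from leaking in,
+ // which gives a byte-wise unsigned >> 1.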
+ const __m128i b = _mm_srli_epi16(_mm_and_si128(abs_pmq, fe), 1);
+ const __m128i c = _mm_adds_epu8(a, _mm_srli_si128(b, 4));
+ return _mm_subs_epu8(c, outer_thresh);
+}
+
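+// The absolute differences are widened with _mm_cvtepu8_epi16 and compared
+// with _mm_cmpgt_epi16 so that values of 128 or more are not misread as
+// negative; hev_thresh is therefore held in 16-bit lanes by the callers.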
+inline __m128i Hev(const __m128i& qp1, const __m128i& qp0,
+ const __m128i& hev_thresh) {
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq =
+ _mm_max_epu8(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 4));
+ const __m128i hev_mask0 = _mm_cvtepu8_epi16(max_pq);
+ const __m128i hev_mask1 = _mm_cmpgt_epi16(hev_mask0, hev_thresh);
+ const __m128i hev_mask = _mm_packs_epi16(hev_mask1, hev_mask1);
+ return hev_mask;
+}
+
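+// There is no per-byte arithmetic shift. AddShift3 and AddShift1 duplicate
+// each byte into a 16-bit lane with _mm_unpacklo_epi8(c, c), shift by 8 + n,
+// and pack, which yields the signed bytes shifted right by n.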
+inline __m128i AddShift3(const __m128i& a, const __m128i& b) {
+ const __m128i c = _mm_adds_epi8(a, b);
+ const __m128i d = _mm_unpacklo_epi8(c, c);
+ const __m128i e = _mm_srai_epi16(d, 11); /* >> 3 */
+ return _mm_packs_epi16(e, e);
+}
+
+inline __m128i AddShift1(const __m128i& a, const __m128i& b) {
+ const __m128i c = _mm_adds_epi8(a, b);
+ const __m128i d = _mm_unpacklo_epi8(c, c);
+ const __m128i e = _mm_srai_epi16(d, 9); /* >> 1 */
+ return _mm_packs_epi16(e, e);
+}
+
+//------------------------------------------------------------------------------
+// 4-tap filters
+
+inline __m128i NeedsFilter4(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i inner_mask = _mm_subs_epu8(
+ _mm_max_epu8(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 4)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi8(a, zero);
+ return b;
+}
+
+inline void Filter4(const __m128i& qp1, const __m128i& qp0, __m128i* oqp1,
+ __m128i* oqp0, const __m128i& mask, const __m128i& hev) {
+ const __m128i t80 = _mm_set1_epi8(static_cast<int8_t>(0x80));
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i qp1qp0 = _mm_unpacklo_epi64(qp0, qp1);
+ const __m128i qps1qps0 = _mm_xor_si128(qp1qp0, t80);
+ const __m128i ps1qs0 = _mm_shuffle_epi32(qps1qps0, 0x09);
+ const __m128i qs1ps0 = _mm_shuffle_epi32(qps1qps0, 0x0c);
+ const __m128i _hev = _mm_unpacklo_epi32(hev, hev);
+ const __m128i x = _mm_subs_epi8(ps1qs0, qs1ps0);
+ __m128i a = _mm_and_si128(_mm_srli_si128(x, 4), _hev);
+
+ a = _mm_adds_epi8(a, x);
+ a = _mm_adds_epi8(a, x);
+ a = _mm_adds_epi8(a, x);
+ a = _mm_and_si128(a, mask);
+ a = _mm_unpacklo_epi32(a, a);
+
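+ // t4t3 packs the constants 4 and 3 so a single saturating AddShift3
+ // produces both (a + 4) >> 3 and (a + 3) >> 3.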
+ const __m128i t4t3 = _mm_set_epi32(0x0, 0x0, 0x04040404, 0x03030303);
+ const __m128i a1a2 = AddShift3(a, t4t3);
+ const __m128i a1a1 = _mm_shuffle_epi32(a1a2, 0x55);
+ const __m128i a3a3 = _mm_andnot_si128(_hev, AddShift1(a1a1, t1));
+ // -1 -1 -1 -1 1 1 1 1 -1 -1 -1 -1 1 1 1 1
+ const __m128i adjust_sign_for_add =
+ _mm_unpacklo_epi32(t1, _mm_cmpeq_epi8(t1, t1));
+
+ const __m128i a3a3a1a2 = _mm_unpacklo_epi64(a1a2, a3a3);
+ const __m128i ma3a3ma1a2 = _mm_sign_epi8(a3a3a1a2, adjust_sign_for_add);
+
+ const __m128i b = _mm_adds_epi8(qps1qps0, ma3a3ma1a2);
+ const __m128i c = _mm_xor_si128(b, t80);
+
+ *oqp0 = c;
+ *oqp1 = _mm_srli_si128(c, 8);
+}
+
+void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh), 0);
+
+ const __m128i p1 = Load4(dst - 2 * stride);
+ const __m128i p0 = Load4(dst - 1 * stride);
+ const __m128i q0 = Load4(dst + 0 * stride);
+ const __m128i q1 = Load4(dst + 1 * stride);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ Store4(dst - 2 * stride, oqp1);
+ Store4(dst - 1 * stride, oqp0);
+ Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+ Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+inline void Transpose4x4(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3, __m128i* d0,
+ __m128i* d1, __m128i* d2, __m128i* d3) {
+ // input
+ // x0 00 01 02 03 xx xx xx xx xx xx xx xx xx xx xx xx
+ // x1 10 11 12 13 xx xx xx xx xx xx xx xx xx xx xx xx
+ // x2 20 21 22 23 xx xx xx xx xx xx xx xx xx xx xx xx
+ // x3 30 31 32 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ // output
+ // d0 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ // d1 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ // d2 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ // d3 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ *d0 = _mm_unpacklo_epi16(w0, w1);
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d1 = _mm_srli_si128(*d0, 4);
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d2 = _mm_srli_si128(*d0, 8);
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d3 = _mm_srli_si128(*d0, 12);
+}
+
+void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ __m128i x0 = Load4(dst - 2 + 0 * stride);
+ __m128i x1 = Load4(dst - 2 + 1 * stride);
+ __m128i x2 = Load4(dst - 2 + 2 * stride);
+ __m128i x3 = Load4(dst - 2 + 3 * stride);
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ const __m128i d0 = _mm_unpacklo_epi16(w0, w1);
+ const __m128i qp1 = _mm_shuffle_epi32(d0, 0xc);
+ const __m128i qp0 = _mm_srli_si128(d0, 4);
+ const __m128i q1q0 = _mm_srli_si128(d0, 8);
+ const __m128i p1p0 = _mm_shuffle_epi32(d0, 0x1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i p1 = oqp1;
+ const __m128i p0 = oqp0;
+ const __m128i q0 = _mm_srli_si128(oqp0, 4);
+ const __m128i q1 = _mm_srli_si128(oqp1, 4);
+
+ Transpose4x4(p1, p0, q0, q1, &x0, &x1, &x2, &x3);
+
+ Store4(dst - 2 + 0 * stride, x0);
+ Store4(dst - 2 + 1 * stride, x1);
+ Store4(dst - 2 + 2 * stride, x2);
+ Store4(dst - 2 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 5-tap (chroma) filters
+
+inline __m128i NeedsFilter6(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+ const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq = _mm_max_epu8(abs_qp2mqp1, abs_qp1mqp0);
+ const __m128i inner_mask = _mm_subs_epu8(
+ _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi8(a, zero);
+ return b;
+}
+
+inline __m128i IsFlat3(const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, const __m128i& flat_thresh) {
+ const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq = _mm_max_epu8(abs_pq2mpq0, abs_qp1mqp0);
+ const __m128i flat_mask = _mm_subs_epu8(
+ _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), flat_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_cmpeq_epi8(flat_mask, zero);
+ return a;
+}
+
+inline void Filter6(const __m128i& qp2, const __m128i& qp1, const __m128i& qp0,
+ __m128i* oqp1, __m128i* oqp0) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2);
+ const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1);
+ const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
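+ // Start with 3 * qp2 plus the rounding term 4 for the >> 3 below.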
+ __m128i f6_lo =
+ _mm_add_epi16(_mm_add_epi16(qp2_lo, four), _mm_add_epi16(qp2_lo, qp2_lo));
+
+ f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp1_lo), qp1_lo);
+
+ f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp0_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p2 * 3 + p1 * 2 + p0 * 2 + q0
+ // q2 * 3 + q1 * 2 + q0 * 2 + p0
+ *oqp1 = _mm_srli_epi16(f6_lo, 3);
+ *oqp1 = _mm_packus_epi16(*oqp1, *oqp1);
+
+ // p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1
+ // q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1
+ f6_lo = FilterAdd2Sub2(f6_lo, pq0_lo, pq1_lo, qp2_lo, qp2_lo);
+ *oqp0 = _mm_srli_epi16(f6_lo, 3);
+ *oqp0 = _mm_packus_epi16(*oqp0, *oqp0);
+}
+
+void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ const __m128i p2 = Load4(dst - 3 * stride);
+ const __m128i p1 = Load4(dst - 2 * stride);
+ const __m128i p0 = Load4(dst - 1 * stride);
+ const __m128i q0 = Load4(dst + 0 * stride);
+ const __m128i q1 = Load4(dst + 1 * stride);
+ const __m128i q2 = Load4(dst + 2 * stride);
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter6(q1q0, p1p0, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0);
+
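+ // _mm_test_all_zeros(v_mask, all ones) is 1 only when v_mask is entirely
+ // zero, so the Filter6 path runs only if some lane passed both the filter
+ // and flatness masks.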
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ __m128i oqp1_f6;
+ __m128i oqp0_f6;
+
+ Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+ }
+
+ Store4(dst - 2 * stride, oqp1);
+ Store4(dst - 1 * stride, oqp0);
+ Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+ Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+inline void Transpose8x4To4x8(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3, __m128i* d0,
+ __m128i* d1, __m128i* d2, __m128i* d3,
+ __m128i* d4, __m128i* d5, __m128i* d6,
+ __m128i* d7) {
+ // input
+ // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+ // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+ // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+ // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+ // output
+ // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ const __m128i ww0 = _mm_unpacklo_epi16(w0, w1);
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ const __m128i ww1 = _mm_unpackhi_epi16(w0, w1);
+
+ // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d0 = ww0;
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d1 = _mm_srli_si128(ww0, 4);
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d2 = _mm_srli_si128(ww0, 8);
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d3 = _mm_srli_si128(ww0, 12);
+ // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d4 = ww1;
+ // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d5 = _mm_srli_si128(ww1, 4);
+ // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d6 = _mm_srli_si128(ww1, 8);
+ // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d7 = _mm_srli_si128(ww1, 12);
+}
+
+void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ __m128i x0 = LoadLo8(dst - 3 + 0 * stride);
+ __m128i x1 = LoadLo8(dst - 3 + 1 * stride);
+ __m128i x2 = LoadLo8(dst - 3 + 2 * stride);
+ __m128i x3 = LoadLo8(dst - 3 + 3 * stride);
+
+ __m128i p2, p1, p0, q0, q1, q2;
+ __m128i z0, z1; // not used
+
+ Transpose8x4To4x8(x0, x1, x2, x3, &p2, &p1, &p0, &q0, &q1, &q2, &z0, &z1);
+
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter6(q1q0, p1p0, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ __m128i oqp1_f6;
+ __m128i oqp0_f6;
+
+ Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+ }
+
+ p1 = oqp1;
+ p0 = oqp0;
+ q0 = _mm_srli_si128(oqp0, 4);
+ q1 = _mm_srli_si128(oqp1, 4);
+
+ Transpose4x4(p1, p0, q0, q1, &x0, &x1, &x2, &x3);
+
+ Store4(dst - 2 + 0 * stride, x0);
+ Store4(dst - 2 + 1 * stride, x1);
+ Store4(dst - 2 + 2 * stride, x2);
+ Store4(dst - 2 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 7-tap filters
+
+inline __m128i NeedsFilter8(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+ const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq_a = _mm_max_epu8(abs_qp2mqp1, abs_qp1mqp0);
+ const __m128i abs_pq3mpq2 = AbsDiff(qp3, qp2);
+ const __m128i max_pq = _mm_max_epu8(max_pq_a, abs_pq3mpq2);
+ const __m128i inner_mask = _mm_subs_epu8(
+ _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi8(a, zero);
+ return b;
+}
+
+inline __m128i IsFlat4(const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& flat_thresh) {
+ const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq_a = _mm_max_epu8(abs_pq2mpq0, abs_qp1mqp0);
+ const __m128i abs_pq3mpq0 = AbsDiff(qp3, qp0);
+ const __m128i max_pq = _mm_max_epu8(max_pq_a, abs_pq3mpq0);
+ const __m128i flat_mask = _mm_subs_epu8(
+ _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), flat_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_cmpeq_epi8(flat_mask, zero);
+ return a;
+}
+
+inline void Filter8(const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, __m128i* oqp2, __m128i* oqp1,
+ __m128i* oqp0) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i qp3_lo = _mm_cvtepu8_epi16(qp3);
+ const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2);
+ const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1);
+ const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0);
+ const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f8_lo =
+ _mm_add_epi16(_mm_add_epi16(qp3_lo, four), _mm_add_epi16(qp3_lo, qp3_lo));
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp2_lo), qp2_lo);
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp1_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0
+ // q3 + q3 + q3 + 2 * q2 + q1 + q0 + p0
+ *oqp2 = _mm_srli_epi16(f8_lo, 3);
+ *oqp2 = _mm_packus_epi16(*oqp2, *oqp2);
+
+ // p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1
+ // q3 + q3 + q2 + 2 * q1 + q0 + p0 + p1
+ f8_lo = FilterAdd2Sub2(f8_lo, qp1_lo, pq1_lo, qp3_lo, qp2_lo);
+ *oqp1 = _mm_srli_epi16(f8_lo, 3);
+ *oqp1 = _mm_packus_epi16(*oqp1, *oqp1);
+
+ // p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2
+ // q3 + q2 + q1 + 2 * q0 + p0 + p1 + p2
+ f8_lo = FilterAdd2Sub2(f8_lo, qp0_lo, pq2_lo, qp3_lo, qp1_lo);
+ *oqp0 = _mm_srli_epi16(f8_lo, 3);
+ *oqp0 = _mm_packus_epi16(*oqp0, *oqp0);
+}
+
+void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ const __m128i p3 = Load4(dst - 4 * stride);
+ const __m128i p2 = Load4(dst - 3 * stride);
+ const __m128i p1 = Load4(dst - 2 * stride);
+ const __m128i p0 = Load4(dst - 1 * stride);
+ const __m128i q0 = Load4(dst + 0 * stride);
+ const __m128i q1 = Load4(dst + 1 * stride);
+ const __m128i q2 = Load4(dst + 2 * stride);
+ const __m128i q3 = Load4(dst + 3 * stride);
+
+ const __m128i qp3 = _mm_unpacklo_epi32(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+ v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+ Store4(dst - 3 * stride, oqp2_f8);
+ Store4(dst + 2 * stride, _mm_srli_si128(oqp2_f8, 4));
+ }
+
+ Store4(dst - 2 * stride, oqp1);
+ Store4(dst - 1 * stride, oqp0);
+ Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+ Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+inline void Transpose8x8To8x4(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3,
+ const __m128i& x4, const __m128i& x5,
+ const __m128i& x6, const __m128i& x7, __m128i* d0,
+ __m128i* d1, __m128i* d2, __m128i* d3) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ // output
+ // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+ // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+ // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+ // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ const __m128i w2 = _mm_unpacklo_epi8(x4, x5);
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const __m128i w3 = _mm_unpacklo_epi8(x6, x7);
+
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ const __m128i w4 = _mm_unpacklo_epi16(w0, w1);
+ // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ const __m128i w5 = _mm_unpacklo_epi16(w2, w3);
+
+ // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ *d0 = _mm_unpacklo_epi32(w4, w5);
+ *d1 = _mm_srli_si128(*d0, 8);
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ *d2 = _mm_unpackhi_epi32(w4, w5);
+ *d3 = _mm_srli_si128(*d2, 8);
+}
+
+void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ __m128i x0 = LoadLo8(dst - 4 + 0 * stride);
+ __m128i x1 = LoadLo8(dst - 4 + 1 * stride);
+ __m128i x2 = LoadLo8(dst - 4 + 2 * stride);
+ __m128i x3 = LoadLo8(dst - 4 + 3 * stride);
+
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ Transpose8x4To4x8(x0, x1, x2, x3, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+
+ const __m128i qp3 = _mm_unpacklo_epi32(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+ v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ p2 = oqp2_f8;
+ q2 = _mm_srli_si128(oqp2_f8, 4);
+ }
+
+ p1 = oqp1;
+ p0 = oqp0;
+ q0 = _mm_srli_si128(oqp0, 4);
+ q1 = _mm_srli_si128(oqp1, 4);
+
+ Transpose8x8To8x4(p3, p2, p1, p0, q0, q1, q2, q3, &x0, &x1, &x2, &x3);
+
+ StoreLo8(dst - 4 + 0 * stride, x0);
+ StoreLo8(dst - 4 + 1 * stride, x1);
+ StoreLo8(dst - 4 + 2 * stride, x2);
+ StoreLo8(dst - 4 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 13-tap filters
+
+inline void Filter14(const __m128i& qp6, const __m128i& qp5, const __m128i& qp4,
+ const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, __m128i* oqp5, __m128i* oqp4,
+ __m128i* oqp3, __m128i* oqp2, __m128i* oqp1,
+ __m128i* oqp0) {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i qp6_lo = _mm_cvtepu8_epi16(qp6);
+ const __m128i qp5_lo = _mm_cvtepu8_epi16(qp5);
+ const __m128i qp4_lo = _mm_cvtepu8_epi16(qp4);
+ const __m128i qp3_lo = _mm_cvtepu8_epi16(qp3);
+ const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2);
+ const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1);
+ const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0);
+ const __m128i pq5_lo = _mm_shuffle_epi32(qp5_lo, 0x4e);
+ const __m128i pq4_lo = _mm_shuffle_epi32(qp4_lo, 0x4e);
+ const __m128i pq3_lo = _mm_shuffle_epi32(qp3_lo, 0x4e);
+ const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
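+ // (qp6 << 3) - qp6 is qp6 * 7; eight is the rounding term for the >> 4
+ // below.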
+ __m128i f14_lo =
+ _mm_add_epi16(eight, _mm_sub_epi16(_mm_slli_epi16(qp6_lo, 3), qp6_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp5_lo),
+ _mm_add_epi16(qp5_lo, qp4_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp4_lo),
+ _mm_add_epi16(qp3_lo, qp2_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp1_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0
+ // q6 * 7 + q5 * 2 + q4 * 2 + q3 + q2 + q1 + q0 + p0
+ *oqp5 = _mm_srli_epi16(f14_lo, 4);
+ *oqp5 = _mm_packus_epi16(*oqp5, *oqp5);
+
+ // p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1
+ // q6 * 5 + q5 * 2 + q4 * 2 + q3 * 2 + q2 + q1 + q0 + p0 + p1
+ f14_lo = FilterAdd2Sub2(f14_lo, qp3_lo, pq1_lo, qp6_lo, qp6_lo);
+ *oqp4 = _mm_srli_epi16(f14_lo, 4);
+ *oqp4 = _mm_packus_epi16(*oqp4, *oqp4);
+
+ // p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2
+ // q6 * 4 + q5 + q4 * 2 + q3 * 2 + q2 * 2 + q1 + q0 + p0 + p1 + p2
+ f14_lo = FilterAdd2Sub2(f14_lo, qp2_lo, pq2_lo, qp6_lo, qp5_lo);
+ *oqp3 = _mm_srli_epi16(f14_lo, 4);
+ *oqp3 = _mm_packus_epi16(*oqp3, *oqp3);
+
+ // p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3
+ // q6 * 3 + q5 + q4 + q3 * 2 + q2 * 2 + q1 * 2 + q0 + p0 + p1 + p2 + p3
+ f14_lo = FilterAdd2Sub2(f14_lo, qp1_lo, pq3_lo, qp6_lo, qp4_lo);
+ *oqp2 = _mm_srli_epi16(f14_lo, 4);
+ *oqp2 = _mm_packus_epi16(*oqp2, *oqp2);
+
+ // p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4
+ // q6 * 2 + q5 + q4 + q3 + q2 * 2 + q1 * 2 + q0 * 2 + p0 + p1 + p2 + p3 + p4
+ f14_lo = FilterAdd2Sub2(f14_lo, qp0_lo, pq4_lo, qp6_lo, qp3_lo);
+ *oqp1 = _mm_srli_epi16(f14_lo, 4);
+ *oqp1 = _mm_packus_epi16(*oqp1, *oqp1);
+
+ // p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5
+ // q6 + q5 + q4 + q3 + q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + p2 + p3 + p4 + p5
+ f14_lo = FilterAdd2Sub2(f14_lo, pq0_lo, pq5_lo, qp6_lo, qp2_lo);
+ *oqp0 = _mm_srli_epi16(f14_lo, 4);
+ *oqp0 = _mm_packus_epi16(*oqp0, *oqp0);
+}
+
+void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ const __m128i p3 = Load4(dst - 4 * stride);
+ const __m128i p2 = Load4(dst - 3 * stride);
+ const __m128i p1 = Load4(dst - 2 * stride);
+ const __m128i p0 = Load4(dst - 1 * stride);
+ const __m128i q0 = Load4(dst + 0 * stride);
+ const __m128i q1 = Load4(dst + 1 * stride);
+ const __m128i q2 = Load4(dst + 2 * stride);
+ const __m128i q3 = Load4(dst + 3 * stride);
+
+ const __m128i qp3 = _mm_unpacklo_epi32(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+ const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+ const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+ v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ const __m128i p6 = Load4(dst - 7 * stride);
+ const __m128i p5 = Load4(dst - 6 * stride);
+ const __m128i p4 = Load4(dst - 5 * stride);
+ const __m128i q4 = Load4(dst + 4 * stride);
+ const __m128i q5 = Load4(dst + 5 * stride);
+ const __m128i q6 = Load4(dst + 6 * stride);
+ const __m128i qp6 = _mm_unpacklo_epi32(p6, q6);
+ const __m128i qp5 = _mm_unpacklo_epi32(p5, q5);
+ const __m128i qp4 = _mm_unpacklo_epi32(p4, q4);
+
+ const __m128i v_isflatouter4_mask =
+ IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+ const __m128i v_flat4_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_mask, v_isflatouter4_mask), 0);
+
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ if (_mm_test_all_zeros(v_flat4_mask,
+ _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) {
+ __m128i oqp5_f14;
+ __m128i oqp4_f14;
+ __m128i oqp3_f14;
+ __m128i oqp2_f14;
+ __m128i oqp1_f14;
+ __m128i oqp0_f14;
+
+ Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+ &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+ oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+ oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+ oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+ oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+
+ Store4(dst - 6 * stride, oqp5_f14);
+ Store4(dst - 5 * stride, oqp4_f14);
+ Store4(dst - 4 * stride, oqp3_f14);
+ Store4(dst + 3 * stride, _mm_srli_si128(oqp3_f14, 4));
+ Store4(dst + 4 * stride, _mm_srli_si128(oqp4_f14, 4));
+ Store4(dst + 5 * stride, _mm_srli_si128(oqp5_f14, 4));
+ }
+
+ Store4(dst - 3 * stride, oqp2_f8);
+ Store4(dst + 2 * stride, _mm_srli_si128(oqp2_f8, 4));
+ }
+
+ Store4(dst - 2 * stride, oqp1);
+ Store4(dst - 1 * stride, oqp0);
+ Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+ Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+// Each of the 8x4 blocks of input data (p7-p0 and q0-q7) is transposed to
+// 4x8, then unpacked into the corresponding qp register (qp7-qp0).
+//
+// p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+//
+// 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+// 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
+// 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
+// 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
+
+inline void DualTranspose8x4To4x8(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3,
+ __m128i* q0p0, __m128i* q1p1, __m128i* q2p2,
+ __m128i* q3p3, __m128i* q4p4, __m128i* q5p5,
+ __m128i* q6p6, __m128i* q7p7) {
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+ // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
+ const __m128i w2 = _mm_unpackhi_epi8(x0, x1);
+ // 28 38 29 39 2a 3a 2b 3b 2c 3c 2d 3d 2e 3e 2f 3f
+ const __m128i w3 = _mm_unpackhi_epi8(x2, x3);
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ const __m128i ww0 = _mm_unpacklo_epi16(w0, w1);
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ const __m128i ww1 = _mm_unpackhi_epi16(w0, w1);
+ // 08 18 28 38 09 19 29 39 0a 1a 2a 3a 0b 1b 2b 3b
+ const __m128i ww2 = _mm_unpacklo_epi16(w2, w3);
+ // 0c 1c 2c 3c 0d 1d 2d 3d 0e 1e 2e 3e 0f 1f 2f 3f
+ const __m128i ww3 = _mm_unpackhi_epi16(w2, w3);
+ // 00 10 20 30 0f 1f 2f 3f xx xx xx xx xx xx xx xx
+ *q7p7 = _mm_unpacklo_epi32(ww0, _mm_srli_si128(ww3, 12));
+ // 01 11 21 31 0e 1e 2e 3e xx xx xx xx xx xx xx xx
+ *q6p6 = _mm_unpackhi_epi32(_mm_slli_si128(ww0, 4), ww3);
+ // 02 12 22 32 0d 1d 2d 3d xx xx xx xx xx xx xx xx
+ *q5p5 = _mm_unpackhi_epi32(ww0, _mm_slli_si128(ww3, 4));
+ // 03 13 23 33 0c 1c 2c 3c xx xx xx xx xx xx xx xx
+ *q4p4 = _mm_unpacklo_epi32(_mm_srli_si128(ww0, 12), ww3);
+ // 04 14 24 34 0b 1b 2b 3b xx xx xx xx xx xx xx xx
+ *q3p3 = _mm_unpacklo_epi32(ww1, _mm_srli_si128(ww2, 12));
+ // 05 15 25 35 0a 1a 2a 3a xx xx xx xx xx xx xx xx
+ *q2p2 = _mm_unpackhi_epi32(_mm_slli_si128(ww1, 4), ww2);
+ // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
+ *q1p1 = _mm_unpackhi_epi32(ww1, _mm_slli_si128(ww2, 4));
+ // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
+ *q0p0 = _mm_unpacklo_epi32(_mm_srli_si128(ww1, 12), ww2);
+}
+
+inline void DualTranspose4x8To8x4(const __m128i& qp7, const __m128i& qp6,
+ const __m128i& qp5, const __m128i& qp4,
+ const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ __m128i* x0, __m128i* x1, __m128i* x2,
+ __m128i* x3) {
+ // qp7: 00 10 20 30 0f 1f 2f 3f xx xx xx xx xx xx xx xx
+ // qp6: 01 11 21 31 0e 1e 2e 3e xx xx xx xx xx xx xx xx
+ // qp5: 02 12 22 32 0d 1d 2d 3d xx xx xx xx xx xx xx xx
+ // qp4: 03 13 23 33 0c 1c 2c 3c xx xx xx xx xx xx xx xx
+ // qp3: 04 14 24 34 0b 1b 2b 3b xx xx xx xx xx xx xx xx
+ // qp2: 05 15 25 35 0a 1a 2a 3a xx xx xx xx xx xx xx xx
+ // qp1: 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
+ // qp0: 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
+
+ // 00 01 10 11 20 21 30 31 0f 0e 1f 1e 2f 2e 3f 3e
+ const __m128i w0 = _mm_unpacklo_epi8(qp7, qp6);
+ // 02 03 12 13 22 23 32 33 xx xx xx xx xx xx xx xx
+ const __m128i w1 = _mm_unpacklo_epi8(qp5, qp4);
+ // 04 05 14 15 24 25 34 35 xx xx xx xx xx xx xx xx
+ const __m128i w2 = _mm_unpacklo_epi8(qp3, qp2);
+ // 06 07 16 17 26 27 36 37 xx xx xx xx xx xx xx xx
+ const __m128i w3 = _mm_unpacklo_epi8(qp1, qp0);
+ // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
+ const __m128i w4 = _mm_unpacklo_epi16(w0, w1);
+ // 04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37
+ const __m128i w5 = _mm_unpacklo_epi16(w2, w3);
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ const __m128i d0 = _mm_unpacklo_epi32(w4, w5);
+ // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+ const __m128i d2 = _mm_unpackhi_epi32(w4, w5);
+ // xx xx xx xx xx xx xx xx 08 09 18 19 28 29 38 39
+ const __m128i w10 = _mm_unpacklo_epi8(qp0, qp1);
+ // xx xx xx xx xx xx xx xx 0a 0b 1a 1b 2a 2b 3a 3b
+ const __m128i w11 = _mm_unpacklo_epi8(qp2, qp3);
+ // xx xx xx xx xx xx xx xx 0c 0d 1c 1d 2c 2d 3c 3d
+ const __m128i w12 = _mm_unpacklo_epi8(qp4, qp5);
+ // xx xx xx xx xx xx xx xx 0e 0f 1e 1f 2e 2f 3e 3f
+ const __m128i w13 = _mm_unpacklo_epi8(qp6, qp7);
+ // 08 09 0a 0b 18 19 1a 1b 28 29 2a 2b 38 39 3a 3b
+ const __m128i w14 = _mm_unpackhi_epi16(w10, w11);
+ // 0c 0d 0e 0f 1c 1d 1e 1f 2c 2d 2e 2f 3c 3d 3e 3f
+ const __m128i w15 = _mm_unpackhi_epi16(w12, w13);
+ // 08 09 0a 0b 0c 0d 0e 0f 18 19 1a 1b 1c 1d 1e 1f
+ const __m128i d1 = _mm_unpacklo_epi32(w14, w15);
+ // 28 29 2a 2b 2c 2d 2e 2f 38 39 3a 3b 3c 3d 3e 3f
+ const __m128i d3 = _mm_unpackhi_epi32(w14, w15);
+
+ // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+ //
+ // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+ *x0 = _mm_unpacklo_epi64(d0, d1);
+ // 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
+ *x1 = _mm_unpackhi_epi64(d0, d1);
+ // 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
+ *x2 = _mm_unpacklo_epi64(d2, d3);
+ // 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
+ *x3 = _mm_unpackhi_epi64(d2, d3);
+}
+
+void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i v_flat_thresh = _mm_set1_epi8(1);
+ const __m128i v_outer_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+ const __m128i v_inner_thresh =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+ const __m128i v_hev_thresh0 =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+ const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+ __m128i x0 = LoadUnaligned16(dst - 8 + 0 * stride);
+ __m128i x1 = LoadUnaligned16(dst - 8 + 1 * stride);
+ __m128i x2 = LoadUnaligned16(dst - 8 + 2 * stride);
+ __m128i x3 = LoadUnaligned16(dst - 8 + 3 * stride);
+
+ __m128i qp7, qp6, qp5, qp4, qp3, qp2, qp1, qp0;
+
+ DualTranspose8x4To4x8(x0, x1, x2, x3, &qp0, &qp1, &qp2, &qp3, &qp4, &qp5,
+ &qp6, &qp7);
+
+ const __m128i qp1qp0 = _mm_unpacklo_epi64(qp0, qp1);
+ const __m128i q1q0 = _mm_shuffle_epi32(qp1qp0, 0x0d);
+ const __m128i p1p0 = _mm_shuffle_epi32(qp1qp0, 0x08);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+ v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ const __m128i v_isflatouter4_mask =
+ IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+ const __m128i v_flat4_mask =
+ _mm_shuffle_epi32(_mm_and_si128(v_mask, v_isflatouter4_mask), 0);
+
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ if (_mm_test_all_zeros(v_flat4_mask,
+ _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) {
+ __m128i oqp5_f14;
+ __m128i oqp4_f14;
+ __m128i oqp3_f14;
+ __m128i oqp2_f14;
+ __m128i oqp1_f14;
+ __m128i oqp0_f14;
+
+ Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+ &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+ oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+ oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+ oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+ oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+ qp3 = oqp3_f14;
+ qp4 = oqp4_f14;
+ qp5 = oqp5_f14;
+ }
+ qp2 = oqp2_f8;
+ }
+
+ DualTranspose4x8To8x4(qp7, qp6, qp5, qp4, qp3, qp2, oqp1, oqp0, &x0, &x1, &x2,
+ &x3);
+
+ StoreUnaligned16(dst - 8 + 0 * stride, x0);
+ StoreUnaligned16(dst - 8 + 1 * stride, x1);
+ StoreUnaligned16(dst - 8 + 2 * stride, x2);
+ StoreUnaligned16(dst - 8 + 3 * stride, x3);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = Horizontal4;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = Horizontal6;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = Horizontal8;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Horizontal14;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = Vertical14;
+#endif
+}
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+namespace high_bitdepth {
+namespace {
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+
+template <int bitdepth>
+struct LoopFilterFuncs_SSE4_1 {
+ LoopFilterFuncs_SSE4_1() = delete;
+
+ static constexpr int kThreshShift = bitdepth - 8;
+
+ static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+ static void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh);
+};
+
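+// The high bitdepth filters operate on packed "qp" registers: qpN holds the
+// four pN pixels in the low 64 bits and the four qN pixels in the high 64
+// bits, so one 8x16-bit operation covers both sides of the edge. pqN denotes
+// the same register with its 64-bit halves swapped
+// (_mm_shuffle_epi32(qpN, 0x4e)). Decision masks are computed in the low 64
+// bits and duplicated to both halves with _mm_unpacklo_epi64() before they
+// are used for blending.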
+inline __m128i Clamp(const __m128i& min, const __m128i& max,
+ const __m128i& val) {
+ const __m128i a = _mm_min_epi16(val, max);
+ const __m128i b = _mm_max_epi16(a, min);
+ return b;
+}
+
+inline __m128i AddShift3(const __m128i& a, const __m128i& b,
+ const __m128i& vmin, const __m128i& vmax) {
+ const __m128i c = _mm_adds_epi16(a, b);
+ const __m128i d = Clamp(vmin, vmax, c);
+ const __m128i e = _mm_srai_epi16(d, 3); /* >> 3 */
+ return e;
+}
+
+inline __m128i AddShift1(const __m128i& a, const __m128i& b) {
+ const __m128i c = _mm_adds_epi16(a, b);
+ const __m128i e = _mm_srai_epi16(c, 1); /* >> 1 */
+ return e;
+}
+
+inline __m128i AbsDiff(const __m128i& a, const __m128i& b) {
+ return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
+}
+
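+// Hev: high edge variance. The q-side difference sits in the upper 64 bits of
+// abs_qp1mqp0, so shifting it down by 8 bytes lets a single _mm_max_epu16()
+// cover max(|p1 - p0|, |q1 - q0|) before the threshold compare.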
+inline __m128i Hev(const __m128i& qp1, const __m128i& qp0,
+ const __m128i& hev_thresh) {
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq =
+ _mm_max_epu16(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 8));
+ const __m128i hev_mask = _mm_cmpgt_epi16(max_pq, hev_thresh);
+ return hev_mask;
+}
+
+inline __m128i CheckOuterThreshF4(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& outer_thresh) {
+ // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh;
+ const __m128i abs_pmq = AbsDiff(p1p0, q1q0);
+ const __m128i a = _mm_adds_epu16(abs_pmq, abs_pmq);
+ const __m128i b = _mm_srli_epi16(abs_pmq, 1);
+ const __m128i c = _mm_adds_epu16(a, _mm_srli_si128(b, 8));
+ return _mm_subs_epu16(c, outer_thresh);
+}
+
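+// _mm_subs_epu16(x, thresh) saturates to zero exactly when x <= thresh, so
+// OR-ing the outer and inner results and comparing against zero yields an
+// all-ones lane only where both threshold checks pass.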
+inline __m128i NeedsFilter4(const __m128i& q1q0, const __m128i& p1p0,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_abs_qp1mqp =
+ _mm_max_epu16(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 8));
+ const __m128i inner_mask = _mm_subs_epu16(max_abs_qp1mqp, inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi16(a, zero);
+ return b;
+}
+
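+// Filter4 works in a signed domain: pixels are re-biased by subtracting
+// 1 << (bitdepth - 1) (t80) so the filter deltas can use saturating signed
+// arithmetic, then the results are clamped and biased back at the end.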
+inline void Filter4(const __m128i& qp1, const __m128i& qp0, __m128i* oqp1,
+ __m128i* oqp0, const __m128i& mask, const __m128i& hev,
+ int bitdepth) {
+ const __m128i t4 = _mm_set1_epi16(4);
+ const __m128i t3 = _mm_set1_epi16(3);
+ const __m128i t80 = _mm_set1_epi16(static_cast<int16_t>(1 << (bitdepth - 1)));
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ const __m128i vmin = _mm_subs_epi16(_mm_setzero_si128(), t80);
+ const __m128i vmax = _mm_subs_epi16(t80, t1);
+ const __m128i ps1 = _mm_subs_epi16(qp1, t80);
+ const __m128i ps0 = _mm_subs_epi16(qp0, t80);
+ const __m128i qs0 = _mm_srli_si128(ps0, 8);
+ const __m128i qs1 = _mm_srli_si128(ps1, 8);
+
+ __m128i a = _mm_subs_epi16(ps1, qs1);
+ a = _mm_and_si128(Clamp(vmin, vmax, a), hev);
+
+ const __m128i x = _mm_subs_epi16(qs0, ps0);
+ a = _mm_adds_epi16(a, x);
+ a = _mm_adds_epi16(a, x);
+ a = _mm_adds_epi16(a, x);
+ a = _mm_and_si128(Clamp(vmin, vmax, a), mask);
+
+ const __m128i a1 = AddShift3(a, t4, vmin, vmax);
+ const __m128i a2 = AddShift3(a, t3, vmin, vmax);
+ const __m128i a3 = _mm_andnot_si128(hev, AddShift1(a1, t1));
+
+ const __m128i ops1 = _mm_adds_epi16(ps1, a3);
+ const __m128i ops0 = _mm_adds_epi16(ps0, a2);
+ const __m128i oqs0 = _mm_subs_epi16(qs0, a1);
+ const __m128i oqs1 = _mm_subs_epi16(qs1, a3);
+
+ __m128i oqps1 = _mm_unpacklo_epi64(ops1, oqs1);
+ __m128i oqps0 = _mm_unpacklo_epi64(ops0, oqs0);
+
+ oqps1 = Clamp(vmin, vmax, oqps1);
+ oqps0 = Clamp(vmin, vmax, oqps0);
+
+ *oqp1 = _mm_adds_epi16(oqps1, t80);
+ *oqp0 = _mm_adds_epi16(oqps0, t80);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal4(void* dest,
+ ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+ const __m128i p1 = LoadLo8(dst - 2 * stride);
+ const __m128i p0 = LoadLo8(dst - 1 * stride);
+ const __m128i qp0 = LoadHi8(p0, dst + 0 * stride);
+ const __m128i qp1 = LoadHi8(p1, dst + 1 * stride);
+ const __m128i q1q0 = _mm_unpackhi_epi64(qp0, qp1);
+ const __m128i p1p0 = _mm_unpacklo_epi64(qp0, qp1);
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ StoreLo8(dst - 2 * stride, oqp1);
+ StoreLo8(dst - 1 * stride, oqp0);
+ StoreHi8(dst + 0 * stride, oqp0);
+ StoreHi8(dst + 1 * stride, oqp1);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical4(void* dest, ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+ const __m128i x0 = LoadLo8(dst - 2 + 0 * stride);
+ const __m128i x1 = LoadLo8(dst - 2 + 1 * stride);
+ const __m128i x2 = LoadLo8(dst - 2 + 2 * stride);
+ const __m128i x3 = LoadLo8(dst - 2 + 3 * stride);
+ // 00 10 01 11 02 12 03 13
+ const __m128i w0 = _mm_unpacklo_epi16(x0, x1);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w1 = _mm_unpacklo_epi16(x2, x3);
+ // 00 10 20 30 01 11 21 31 p0p1
+ const __m128i a = _mm_unpacklo_epi32(w0, w1);
+ const __m128i p1p0 = _mm_shuffle_epi32(a, 0x4e);
+ // 02 12 22 32 03 13 23 33 q1q0
+ const __m128i q1q0 = _mm_unpackhi_epi32(w0, w1);
+ const __m128i qp1 = _mm_unpackhi_epi64(p1p0, q1q0);
+ const __m128i qp0 = _mm_unpacklo_epi64(p1p0, q1q0);
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w2 = _mm_unpacklo_epi16(oqp1, oqp0);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w3 = _mm_unpackhi_epi16(oqp0, oqp1);
+ // 00 10 20 30 01 11 21 31
+ const __m128i op0p1 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i oq1q0 = _mm_unpackhi_epi32(w2, w3);
+
+ StoreLo8(dst - 2 + 0 * stride, op0p1);
+ StoreHi8(dst - 2 + 1 * stride, op0p1);
+ StoreLo8(dst - 2 + 2 * stride, oq1q0);
+ StoreHi8(dst - 2 + 3 * stride, oq1q0);
+}
+
+//------------------------------------------------------------------------------
+// 5-tap (chroma) filters
+
+inline __m128i CheckOuterThreshF6(const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh) {
+ // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh;
+ const __m128i q1q0 = _mm_unpackhi_epi64(qp0, qp1);
+ const __m128i p1p0 = _mm_unpacklo_epi64(qp0, qp1);
+ return CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+}
+
+inline __m128i NeedsFilter6(const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF6(qp1, qp0, outer_thresh);
+ const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq = _mm_max_epu16(abs_qp2mqp1, abs_qp1mqp0);
+ const __m128i inner_mask = _mm_subs_epu16(
+ _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi16(a, zero);
+ return b;
+}
+
+inline __m128i IsFlat3(const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, const __m128i& flat_thresh) {
+ const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq = _mm_max_epu16(abs_pq2mpq0, abs_qp1mqp0);
+ const __m128i flat_mask = _mm_subs_epu16(
+ _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), flat_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_cmpeq_epi16(flat_mask, zero);
+ return a;
+}
+
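+// Filter6 produces both sides of the edge at once: the qp inputs keep the p
+// taps in the low half and the q taps in the high half, and the pq aliases
+// (halves swapped) supply the taps from the opposite side, so one running sum
+// yields the p and q outputs simultaneously. The effective tap weights are
+// listed in the comments below.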
+inline void Filter6(const __m128i& qp2, const __m128i& qp1, const __m128i& qp0,
+ __m128i* oqp1, __m128i* oqp0) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i qp2_lo = qp2;
+ const __m128i qp1_lo = qp1;
+ const __m128i qp0_lo = qp0;
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f6_lo =
+ _mm_add_epi16(_mm_add_epi16(qp2_lo, four), _mm_add_epi16(qp2_lo, qp2_lo));
+
+ f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp1_lo), qp1_lo);
+
+ f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp0_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p2 * 3 + p1 * 2 + p0 * 2 + q0
+ // q2 * 3 + q1 * 2 + q0 * 2 + p0
+ *oqp1 = _mm_srli_epi16(f6_lo, 3);
+
+ // p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1
+ // q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1
+ f6_lo = FilterAdd2Sub2(f6_lo, pq0_lo, pq1_lo, qp2_lo, qp2_lo);
+ *oqp0 = _mm_srli_epi16(f6_lo, 3);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal6(void* dest,
+ ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ const __m128i p2 = LoadLo8(dst - 3 * stride);
+ const __m128i p1 = LoadLo8(dst - 2 * stride);
+ const __m128i p0 = LoadLo8(dst - 1 * stride);
+ const __m128i q0 = LoadLo8(dst + 0 * stride);
+ const __m128i q1 = LoadLo8(dst + 1 * stride);
+ const __m128i q2 = LoadLo8(dst + 2 * stride);
+
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter6(qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ __m128i oqp1_f6;
+ __m128i oqp0_f6;
+
+ Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+ }
+
+ StoreLo8(dst - 2 * stride, oqp1);
+ StoreLo8(dst - 1 * stride, oqp0);
+ StoreHi8(dst + 0 * stride, oqp0);
+ StoreHi8(dst + 1 * stride, oqp1);
+}
+
+inline void Transpose8x4To4x8(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3, __m128i* d0,
+ __m128i* d1, __m128i* d2, __m128i* d3,
+ __m128i* d4, __m128i* d5, __m128i* d6,
+ __m128i* d7) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // output
+ // 00 10 20 30 xx xx xx xx
+ // 01 11 21 31 xx xx xx xx
+ // 02 12 22 32 xx xx xx xx
+ // 03 13 23 33 xx xx xx xx
+ // 04 14 24 34 xx xx xx xx
+ // 05 15 25 35 xx xx xx xx
+ // 06 16 26 36 xx xx xx xx
+ // 07 17 27 37 xx xx xx xx
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w0 = _mm_unpacklo_epi16(x0, x1);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w1 = _mm_unpacklo_epi16(x2, x3);
+ // 04 14 05 15 06 16 07 17
+ const __m128i w2 = _mm_unpackhi_epi16(x0, x1);
+ // 24 34 25 35 26 36 27 37
+ const __m128i w3 = _mm_unpackhi_epi16(x2, x3);
+
+ // 00 10 20 30 01 11 21 31
+ const __m128i ww0 = _mm_unpacklo_epi32(w0, w1);
+ // 04 14 24 34 05 15 25 35
+ const __m128i ww1 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i ww2 = _mm_unpackhi_epi32(w0, w1);
+ // 06 16 26 36 07 17 27 37
+ const __m128i ww3 = _mm_unpackhi_epi32(w2, w3);
+
+ // 00 10 20 30 xx xx xx xx
+ *d0 = ww0;
+ // 01 11 21 31 xx xx xx xx
+ *d1 = _mm_srli_si128(ww0, 8);
+ // 02 12 22 32 xx xx xx xx
+ *d2 = ww2;
+ // 03 13 23 33 xx xx xx xx
+ *d3 = _mm_srli_si128(ww2, 8);
+ // 04 14 24 34 xx xx xx xx
+ *d4 = ww1;
+ // 05 15 25 35 xx xx xx xx
+ *d5 = _mm_srli_si128(ww1, 8);
+ // 06 16 26 36 xx xx xx xx
+ *d6 = ww3;
+ // 07 17 27 37 xx xx xx xx
+ *d7 = _mm_srli_si128(ww3, 8);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical6(void* dest, ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ __m128i x0 = LoadUnaligned16(dst - 3 + 0 * stride);
+ __m128i x1 = LoadUnaligned16(dst - 3 + 1 * stride);
+ __m128i x2 = LoadUnaligned16(dst - 3 + 2 * stride);
+ __m128i x3 = LoadUnaligned16(dst - 3 + 3 * stride);
+
+ __m128i p2, p1, p0, q0, q1, q2;
+ __m128i z0, z1; // not used
+
+ Transpose8x4To4x8(x0, x1, x2, x3, &p2, &p1, &p0, &q0, &q1, &q2, &z0, &z1);
+
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter6(qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ __m128i oqp1_f6;
+ __m128i oqp0_f6;
+
+ Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+ }
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w2 = _mm_unpacklo_epi16(oqp1, oqp0);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w3 = _mm_unpackhi_epi16(oqp0, oqp1);
+ // 00 10 20 30 01 11 21 31
+ const __m128i op0p1 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i oq1q0 = _mm_unpackhi_epi32(w2, w3);
+
+ StoreLo8(dst - 2 + 0 * stride, op0p1);
+ StoreHi8(dst - 2 + 1 * stride, op0p1);
+ StoreLo8(dst - 2 + 2 * stride, oq1q0);
+ StoreHi8(dst - 2 + 3 * stride, oq1q0);
+}
+
+//------------------------------------------------------------------------------
+// 7-tap filters
+inline __m128i NeedsFilter8(const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& outer_thresh,
+ const __m128i& inner_thresh) {
+ const __m128i outer_mask = CheckOuterThreshF6(qp1, qp0, outer_thresh);
+ const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq_a = _mm_max_epu16(abs_qp2mqp1, abs_qp1mqp0);
+ const __m128i abs_pq3mpq2 = AbsDiff(qp3, qp2);
+ const __m128i max_pq = _mm_max_epu16(max_pq_a, abs_pq3mpq2);
+ const __m128i inner_mask = _mm_subs_epu16(
+ _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), inner_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+ const __m128i b = _mm_cmpeq_epi16(a, zero);
+ return b;
+}
+
+inline __m128i IsFlat4(const __m128i& qp3, const __m128i& qp2,
+ const __m128i& qp1, const __m128i& qp0,
+ const __m128i& flat_thresh) {
+ const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+ const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+ const __m128i max_pq_a = _mm_max_epu16(abs_pq2mpq0, abs_qp1mqp0);
+ const __m128i abs_pq3mpq0 = AbsDiff(qp3, qp0);
+ const __m128i max_pq = _mm_max_epu16(max_pq_a, abs_pq3mpq0);
+ const __m128i flat_mask = _mm_subs_epu16(
+ _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), flat_thresh);
+ // ~mask
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a = _mm_cmpeq_epi16(flat_mask, zero);
+ return a;
+}
+
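+// Filter8 builds the first 7-tap sum once and then slides the window with
+// FilterAdd2Sub2(), which adds two incoming taps and subtracts two outgoing
+// ones per output; the effective tap weights are listed in the comments below.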
+inline void Filter8(const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, __m128i* oqp2, __m128i* oqp1,
+ __m128i* oqp0) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i qp3_lo = qp3;
+ const __m128i qp2_lo = qp2;
+ const __m128i qp1_lo = qp1;
+ const __m128i qp0_lo = qp0;
+ const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f8_lo =
+ _mm_add_epi16(_mm_add_epi16(qp3_lo, four), _mm_add_epi16(qp3_lo, qp3_lo));
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp2_lo), qp2_lo);
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp1_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0
+ // q3 + q3 + q3 + 2 * q2 + q1 + q0 + p0
+ *oqp2 = _mm_srli_epi16(f8_lo, 3);
+
+ // p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1
+ // q3 + q3 + q2 + 2 * q1 + q0 + p0 + p1
+ f8_lo = FilterAdd2Sub2(f8_lo, qp1_lo, pq1_lo, qp3_lo, qp2_lo);
+ *oqp1 = _mm_srli_epi16(f8_lo, 3);
+
+ // p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2
+ // q3 + q2 + q1 + 2 * q0 + p0 + p1 + p2
+ f8_lo = FilterAdd2Sub2(f8_lo, qp0_lo, pq2_lo, qp3_lo, qp1_lo);
+ *oqp0 = _mm_srli_epi16(f8_lo, 3);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal8(void* dest,
+ ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ const __m128i p3 = LoadLo8(dst - 4 * stride);
+ const __m128i p2 = LoadLo8(dst - 3 * stride);
+ const __m128i p1 = LoadLo8(dst - 2 * stride);
+ const __m128i p0 = LoadLo8(dst - 1 * stride);
+ const __m128i q0 = LoadLo8(dst + 0 * stride);
+ const __m128i q1 = LoadLo8(dst + 1 * stride);
+ const __m128i q2 = LoadLo8(dst + 2 * stride);
+ const __m128i q3 = LoadLo8(dst + 3 * stride);
+ const __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+ StoreLo8(dst - 3 * stride, oqp2_f8);
+ StoreHi8(dst + 2 * stride, oqp2_f8);
+ }
+
+ StoreLo8(dst - 2 * stride, oqp1);
+ StoreLo8(dst - 1 * stride, oqp0);
+ StoreHi8(dst + 0 * stride, oqp0);
+ StoreHi8(dst + 1 * stride, oqp1);
+}
+
+inline void TransposeLower4x8To8x4(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3,
+ const __m128i& x4, const __m128i& x5,
+ const __m128i& x6, const __m128i& x7,
+ __m128i* d0, __m128i* d1, __m128i* d2,
+ __m128i* d3) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ // output
+ // d0 00 10 20 30 40 50 60 70
+ // d1 01 11 21 31 41 51 61 71
+ // d2 02 12 22 32 42 52 62 72
+ // d3 03 13 23 33 43 53 63 73
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w0 = _mm_unpacklo_epi16(x0, x1);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w1 = _mm_unpacklo_epi16(x2, x3);
+ // 40 50 41 51 42 52 43 53
+ const __m128i w2 = _mm_unpacklo_epi16(x4, x5);
+ // 60 70 61 71 62 72 63 73
+ const __m128i w3 = _mm_unpacklo_epi16(x6, x7);
+
+ // 00 10 20 30 01 11 21 31
+ const __m128i w4 = _mm_unpacklo_epi32(w0, w1);
+ // 40 50 60 70 41 51 61 71
+ const __m128i w5 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i w6 = _mm_unpackhi_epi32(w0, w1);
+ // 42 52 62 72 43 53 63 73
+ const __m128i w7 = _mm_unpackhi_epi32(w2, w3);
+
+ // 00 10 20 30 40 50 60 70
+ *d0 = _mm_unpacklo_epi64(w4, w5);
+ // 01 11 21 31 41 51 61 71
+ *d1 = _mm_unpackhi_epi64(w4, w5);
+ // 02 12 22 32 42 52 62 72
+ *d2 = _mm_unpacklo_epi64(w6, w7);
+ // 03 13 23 33 43 53 63 73
+ *d3 = _mm_unpackhi_epi64(w6, w7);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical8(void* dest, ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ __m128i x0 = LoadUnaligned16(dst - 4 + 0 * stride);
+ __m128i x1 = LoadUnaligned16(dst - 4 + 1 * stride);
+ __m128i x2 = LoadUnaligned16(dst - 4 + 2 * stride);
+ __m128i x3 = LoadUnaligned16(dst - 4 + 3 * stride);
+
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ Transpose8x4To4x8(x0, x1, x2, x3, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+
+ const __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ p2 = oqp2_f8;
+ q2 = _mm_srli_si128(oqp2_f8, 8);
+ }
+
+ p1 = oqp1;
+ p0 = oqp0;
+ q0 = _mm_srli_si128(oqp0, 8);
+ q1 = _mm_srli_si128(oqp1, 8);
+
+ TransposeLower4x8To8x4(p3, p2, p1, p0, q0, q1, q2, q3, &x0, &x1, &x2, &x3);
+
+ StoreUnaligned16(dst - 4 + 0 * stride, x0);
+ StoreUnaligned16(dst - 4 + 1 * stride, x1);
+ StoreUnaligned16(dst - 4 + 2 * stride, x2);
+ StoreUnaligned16(dst - 4 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 13-tap filters
+
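+// Filter14 (the 13-tap filter) seeds the running sum with 7 * p6 via
+// (p6 << 3) - p6 plus the rounding constant 8, then derives each successive
+// output with a single add-two/subtract-two update and a shift by 4.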
+inline void Filter14(const __m128i& qp6, const __m128i& qp5, const __m128i& qp4,
+ const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+ const __m128i& qp0, __m128i* oqp5, __m128i* oqp4,
+ __m128i* oqp3, __m128i* oqp2, __m128i* oqp1,
+ __m128i* oqp0) {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i qp6_lo = qp6;
+ const __m128i qp5_lo = qp5;
+ const __m128i qp4_lo = qp4;
+ const __m128i qp3_lo = qp3;
+ const __m128i qp2_lo = qp2;
+ const __m128i qp1_lo = qp1;
+ const __m128i qp0_lo = qp0;
+ const __m128i pq5_lo = _mm_shuffle_epi32(qp5_lo, 0x4e);
+ const __m128i pq4_lo = _mm_shuffle_epi32(qp4_lo, 0x4e);
+ const __m128i pq3_lo = _mm_shuffle_epi32(qp3_lo, 0x4e);
+ const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+ const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+ const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+ __m128i f14_lo =
+ _mm_add_epi16(eight, _mm_sub_epi16(_mm_slli_epi16(qp6_lo, 3), qp6_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp5_lo),
+ _mm_add_epi16(qp5_lo, qp4_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp4_lo),
+ _mm_add_epi16(qp3_lo, qp2_lo));
+
+ f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp1_lo),
+ _mm_add_epi16(qp0_lo, pq0_lo));
+
+ // p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0
+ // q6 * 7 + q5 * 2 + q4 * 2 + q3 + q2 + q1 + q0 + p0
+ *oqp5 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1
+ // q6 * 5 + q5 * 2 + q4 * 2 + q3 * 2 + q2 + q1 + q0 + p0 + p1
+ f14_lo = FilterAdd2Sub2(f14_lo, qp3_lo, pq1_lo, qp6_lo, qp6_lo);
+ *oqp4 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2
+ // q6 * 4 + q5 + q4 * 2 + q3 * 2 + q2 * 2 + q1 + q0 + p0 + p1 + p2
+ f14_lo = FilterAdd2Sub2(f14_lo, qp2_lo, pq2_lo, qp6_lo, qp5_lo);
+ *oqp3 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3
+ // q6 * 3 + q5 + q4 + q3 * 2 + q2 * 2 + q1 * 2 + q0 + p0 + p1 + p2 + p3
+ f14_lo = FilterAdd2Sub2(f14_lo, qp1_lo, pq3_lo, qp6_lo, qp4_lo);
+ *oqp2 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4
+ // q6 * 2 + q5 + q4 + q3 + q2 * 2 + q1 * 2 + q0 * 2 + p0 + p1 + p2 + p3 + p4
+ f14_lo = FilterAdd2Sub2(f14_lo, qp0_lo, pq4_lo, qp6_lo, qp3_lo);
+ *oqp1 = _mm_srli_epi16(f14_lo, 4);
+
+ // p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5
+ // q6 + q5 + q4 + q3 + q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + p2 + p3 + p4 + p5
+ f14_lo = FilterAdd2Sub2(f14_lo, pq0_lo, pq5_lo, qp6_lo, qp2_lo);
+ *oqp0 = _mm_srli_epi16(f14_lo, 4);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal14(void* dest,
+ ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ const __m128i p3 = LoadLo8(dst - 4 * stride);
+ const __m128i p2 = LoadLo8(dst - 3 * stride);
+ const __m128i p1 = LoadLo8(dst - 2 * stride);
+ const __m128i p0 = LoadLo8(dst - 1 * stride);
+ const __m128i q0 = LoadLo8(dst + 0 * stride);
+ const __m128i q1 = LoadLo8(dst + 1 * stride);
+ const __m128i q2 = LoadLo8(dst + 2 * stride);
+ const __m128i q3 = LoadLo8(dst + 3 * stride);
+ const __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+ const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
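+ // The outer taps p6..p4 / q4..q6 are only loaded when at least one lane
+ // passed the inner flatness test; otherwise neither the 7-tap nor the
+ // 13-tap path can be taken.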
+ const __m128i p6 = LoadLo8(dst - 7 * stride);
+ const __m128i p5 = LoadLo8(dst - 6 * stride);
+ const __m128i p4 = LoadLo8(dst - 5 * stride);
+ const __m128i q4 = LoadLo8(dst + 4 * stride);
+ const __m128i q5 = LoadLo8(dst + 5 * stride);
+ const __m128i q6 = LoadLo8(dst + 6 * stride);
+ const __m128i qp6 = _mm_unpacklo_epi64(p6, q6);
+ const __m128i qp5 = _mm_unpacklo_epi64(p5, q5);
+ const __m128i qp4 = _mm_unpacklo_epi64(p4, q4);
+
+ const __m128i v_isflatouter4_mask =
+ IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+ const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask);
+ const __m128i v_flat4_mask =
+ _mm_unpacklo_epi64(v_flat4_mask_lo, v_flat4_mask_lo);
+
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ if (_mm_test_all_zeros(v_flat4_mask,
+ _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) {
+ __m128i oqp5_f14;
+ __m128i oqp4_f14;
+ __m128i oqp3_f14;
+ __m128i oqp2_f14;
+ __m128i oqp1_f14;
+ __m128i oqp0_f14;
+
+ Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+ &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+ oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+ oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+ oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+ oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+
+ StoreLo8(dst - 6 * stride, oqp5_f14);
+ StoreLo8(dst - 5 * stride, oqp4_f14);
+ StoreLo8(dst - 4 * stride, oqp3_f14);
+
+ StoreHi8(dst + 3 * stride, oqp3_f14);
+ StoreHi8(dst + 4 * stride, oqp4_f14);
+ StoreHi8(dst + 5 * stride, oqp5_f14);
+ }
+
+ StoreLo8(dst - 3 * stride, oqp2_f8);
+ StoreHi8(dst + 2 * stride, oqp2_f8);
+ }
+
+ StoreLo8(dst - 2 * stride, oqp1);
+ StoreLo8(dst - 1 * stride, oqp0);
+ StoreHi8(dst + 0 * stride, oqp0);
+ StoreHi8(dst + 1 * stride, oqp1);
+}
+
+inline void TransposeUpper4x8To8x4(const __m128i& x0, const __m128i& x1,
+ const __m128i& x2, const __m128i& x3,
+ const __m128i& x4, const __m128i& x5,
+ const __m128i& x6, const __m128i& x7,
+ __m128i* d0, __m128i* d1, __m128i* d2,
+ __m128i* d3) {
+ // input
+ // x0 xx xx xx xx 00 01 02 03
+ // x1 xx xx xx xx 10 11 12 13
+ // x2 xx xx xx xx 20 21 22 23
+ // x3 xx xx xx xx 30 31 32 33
+ // x4 xx xx xx xx 40 41 42 43
+ // x5 xx xx xx xx 50 51 52 53
+ // x6 xx xx xx xx 60 61 62 63
+ // x7 xx xx xx xx 70 71 72 73
+ // output
+ // d0 00 10 20 30 40 50 60 70
+ // d1 01 11 21 31 41 51 61 71
+ // d2 02 12 22 32 42 52 62 72
+ // d3 03 13 23 33 43 53 63 73
+
+ // 00 10 01 11 02 12 03 13
+ const __m128i w0 = _mm_unpackhi_epi16(x0, x1);
+ // 20 30 21 31 22 32 23 33
+ const __m128i w1 = _mm_unpackhi_epi16(x2, x3);
+ // 40 50 41 51 42 52 43 53
+ const __m128i w2 = _mm_unpackhi_epi16(x4, x5);
+ // 60 70 61 71 62 72 63 73
+ const __m128i w3 = _mm_unpackhi_epi16(x6, x7);
+
+ // 00 10 20 30 01 11 21 31
+ const __m128i w4 = _mm_unpacklo_epi32(w0, w1);
+ // 40 50 60 70 41 51 61 71
+ const __m128i w5 = _mm_unpacklo_epi32(w2, w3);
+ // 02 12 22 32 03 13 23 33
+ const __m128i w6 = _mm_unpackhi_epi32(w0, w1);
+ // 42 52 62 72 43 53 63 73
+ const __m128i w7 = _mm_unpackhi_epi32(w2, w3);
+
+ // 00 10 20 30 40 50 60 70
+ *d0 = _mm_unpacklo_epi64(w4, w5);
+ // 01 11 21 31 41 51 61 71
+ *d1 = _mm_unpackhi_epi64(w4, w5);
+ // 02 12 22 32 42 52 62 72
+ *d2 = _mm_unpacklo_epi64(w6, w7);
+ // 03 13 23 33 43 53 63 73
+ *d3 = _mm_unpackhi_epi64(w6, w7);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical14(void* dest, ptrdiff_t stride8,
+ int outer_thresh,
+ int inner_thresh,
+ int hev_thresh) {
+ auto* const dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t stride = stride8 / 2;
+ const __m128i v_flat_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+ const __m128i v_outer_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+ const __m128i v_inner_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+ const __m128i v_hev_thresh =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+ // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+ //
+ // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+ // 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
+ // 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
+ // 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
+
+ __m128i x0 = LoadUnaligned16(dst - 8 + 0 * stride);
+ __m128i x1 = LoadUnaligned16(dst - 8 + 1 * stride);
+ __m128i x2 = LoadUnaligned16(dst - 8 + 2 * stride);
+ __m128i x3 = LoadUnaligned16(dst - 8 + 3 * stride);
+
+ __m128i p7, p6, p5, p4, p3, p2, p1, p0;
+ __m128i q7, q6, q5, q4, q3, q2, q1, q0;
+
+ Transpose8x4To4x8(x0, x1, x2, x3, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0);
+
+ x0 = LoadUnaligned16(dst - 8 + 8 + 0 * stride);
+ x1 = LoadUnaligned16(dst - 8 + 8 + 1 * stride);
+ x2 = LoadUnaligned16(dst - 8 + 8 + 2 * stride);
+ x3 = LoadUnaligned16(dst - 8 + 8 + 3 * stride);
+
+ Transpose8x4To4x8(x0, x1, x2, x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+
+ __m128i qp7 = _mm_unpacklo_epi64(p7, q7);
+ __m128i qp6 = _mm_unpacklo_epi64(p6, q6);
+ __m128i qp5 = _mm_unpacklo_epi64(p5, q5);
+ __m128i qp4 = _mm_unpacklo_epi64(p4, q4);
+ __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+ __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+ __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+ __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+ const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+ const __m128i v_needs_mask =
+ NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+ __m128i oqp1;
+ __m128i oqp0;
+
+ Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+ const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+ const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+ const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+ if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ const __m128i v_isflatouter4_mask =
+ IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+ const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask);
+ const __m128i v_flat4_mask =
+ _mm_unpacklo_epi64(v_flat4_mask_lo, v_flat4_mask_lo);
+
+ __m128i oqp2_f8;
+ __m128i oqp1_f8;
+ __m128i oqp0_f8;
+
+ Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+ oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+ if (_mm_test_all_zeros(v_flat4_mask,
+ _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) {
+ __m128i oqp5_f14;
+ __m128i oqp4_f14;
+ __m128i oqp3_f14;
+ __m128i oqp2_f14;
+ __m128i oqp1_f14;
+ __m128i oqp0_f14;
+
+ Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+ &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+ oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+ oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+ oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+ oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+ oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+ oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+ qp3 = oqp3_f14;
+ qp4 = oqp4_f14;
+ qp5 = oqp5_f14;
+ }
+ qp2 = oqp2_f8;
+ }
+
+ TransposeLower4x8To8x4(qp7, qp6, qp5, qp4, qp3, qp2, oqp1, oqp0, &x0, &x1,
+ &x2, &x3);
+
+ StoreUnaligned16(dst - 8 + 0 * stride, x0);
+ StoreUnaligned16(dst - 8 + 1 * stride, x1);
+ StoreUnaligned16(dst - 8 + 2 * stride, x2);
+ StoreUnaligned16(dst - 8 + 3 * stride, x3);
+
+ TransposeUpper4x8To8x4(oqp0, oqp1, qp2, qp3, qp4, qp5, qp6, qp7, &x0, &x1,
+ &x2, &x3);
+
+ StoreUnaligned16(dst - 8 + 8 + 0 * stride, x0);
+ StoreUnaligned16(dst - 8 + 8 + 1 * stride, x1);
+ StoreUnaligned16(dst - 8 + 8 + 2 * stride, x2);
+ StoreUnaligned16(dst - 8 + 8 + 3 * stride, x3);
+}
+
+using Defs10bpp = LoopFilterFuncs_SSE4_1<kBitdepth10>;
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal4;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal6;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal8;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeHorizontal)
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs10bpp::Horizontal14;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical4;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical6;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical8;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeVertical)
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs10bpp::Vertical14;
+#endif
+}
+#endif
+} // namespace
+} // namespace high_bitdepth
+
+void LoopFilterInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void LoopFilterInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/loop_filter_sse4.h b/src/dsp/x86/loop_filter_sse4.h
new file mode 100644
index 0000000..4795d8b
--- /dev/null
+++ b/src/dsp/x86/loop_filter_sse4.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_filters, see the defines below for specifics. This
+// function is not thread-safe.
+void LoopFilterInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't already set by a higher level of
+// optimization, signal that the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical
+#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_
diff --git a/src/dsp/x86/loop_restoration_10bit_avx2.cc b/src/dsp/x86/loop_restoration_10bit_avx2.cc
new file mode 100644
index 0000000..702bdea
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_10bit_avx2.cc
@@ -0,0 +1,592 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
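+// Horizontal pass of the 10bpp Wiener filter. The 32-bit filter sums are
+// rounded by kInterRoundBitsHorizontal and clamped to the signed 16-bit range
+// [-offset, limit - offset] expected by the vertical pass.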
+inline void WienerHorizontalClip(const __m256i s[2],
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit = (offset << 2) - 1;
+ const __m256i offsets = _mm256_set1_epi16(-offset);
+ const __m256i limits = _mm256_set1_epi16(limit - offset);
+ const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsHorizontal - 1));
+ const __m256i sum0 = _mm256_add_epi32(s[0], round);
+ const __m256i sum1 = _mm256_add_epi32(s[1], round);
+ const __m256i rounded_sum0 =
+ _mm256_srai_epi32(sum0, kInterRoundBitsHorizontal);
+ const __m256i rounded_sum1 =
+ _mm256_srai_epi32(sum1, kInterRoundBitsHorizontal);
+ const __m256i rounded_sum = _mm256_packs_epi32(rounded_sum0, rounded_sum1);
+ const __m256i d0 = _mm256_max_epi16(rounded_sum, offsets);
+ const __m256i d1 = _mm256_min_epi16(d0, limits);
+ StoreAligned32(wiener_buffer, d1);
+}
+
+inline void WienerHorizontalTap7Kernel(const __m256i s[7],
+ const __m256i filter[2],
+ int16_t* const wiener_buffer) {
+ const __m256i s06 = _mm256_add_epi16(s[0], s[6]);
+ const __m256i s15 = _mm256_add_epi16(s[1], s[5]);
+ const __m256i s24 = _mm256_add_epi16(s[2], s[4]);
+ const __m256i ss0 = _mm256_unpacklo_epi16(s06, s15);
+ const __m256i ss1 = _mm256_unpackhi_epi16(s06, s15);
+ const __m256i ss2 = _mm256_unpacklo_epi16(s24, s[3]);
+ const __m256i ss3 = _mm256_unpackhi_epi16(s24, s[3]);
+ __m256i madds[4];
+ madds[0] = _mm256_madd_epi16(ss0, filter[0]);
+ madds[1] = _mm256_madd_epi16(ss1, filter[0]);
+ madds[2] = _mm256_madd_epi16(ss2, filter[1]);
+ madds[3] = _mm256_madd_epi16(ss3, filter[1]);
+ madds[0] = _mm256_add_epi32(madds[0], madds[2]);
+ madds[1] = _mm256_add_epi32(madds[1], madds[3]);
+ WienerHorizontalClip(madds, wiener_buffer);
+}
+
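+// The 5-tap kernel stores only the two outer coefficient pairs in `filter`.
+// Because the Wiener taps sum to 1 << kWienerFilterBits (128), the center tap
+// equals 128 minus twice the sum of the outer taps, so subtracting 2 * s[2]
+// from each pair sum and adding s[2] << 7 afterwards is equivalent to
+// applying the center tap directly.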
+inline void WienerHorizontalTap5Kernel(const __m256i s[5], const __m256i filter,
+ int16_t* const wiener_buffer) {
+ const __m256i s04 = _mm256_add_epi16(s[0], s[4]);
+ const __m256i s13 = _mm256_add_epi16(s[1], s[3]);
+ const __m256i s2d = _mm256_add_epi16(s[2], s[2]);
+ const __m256i s0m = _mm256_sub_epi16(s04, s2d);
+ const __m256i s1m = _mm256_sub_epi16(s13, s2d);
+ const __m256i ss0 = _mm256_unpacklo_epi16(s0m, s1m);
+ const __m256i ss1 = _mm256_unpackhi_epi16(s0m, s1m);
+ __m256i madds[2];
+ madds[0] = _mm256_madd_epi16(ss0, filter);
+ madds[1] = _mm256_madd_epi16(ss1, filter);
+ const __m256i s2_lo = _mm256_unpacklo_epi16(s[2], _mm256_setzero_si256());
+ const __m256i s2_hi = _mm256_unpackhi_epi16(s[2], _mm256_setzero_si256());
+ const __m256i s2x128_lo = _mm256_slli_epi32(s2_lo, 7);
+ const __m256i s2x128_hi = _mm256_slli_epi32(s2_hi, 7);
+ madds[0] = _mm256_add_epi32(madds[0], s2x128_lo);
+ madds[1] = _mm256_add_epi32(madds[1], s2x128_hi);
+ WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap3Kernel(const __m256i s[3], const __m256i filter,
+ int16_t* const wiener_buffer) {
+ const __m256i s02 = _mm256_add_epi16(s[0], s[2]);
+ const __m256i ss0 = _mm256_unpacklo_epi16(s02, s[1]);
+ const __m256i ss1 = _mm256_unpackhi_epi16(s02, s[1]);
+ __m256i madds[2];
+ madds[0] = _mm256_madd_epi16(ss0, filter);
+ madds[1] = _mm256_madd_epi16(ss1, filter);
+ WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap7(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i* const coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi32(*coefficients, 0x0);
+ filter[1] = _mm256_shuffle_epi32(*coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i s[7];
+ s[0] = LoadUnaligned32(src + x + 0);
+ s[1] = LoadUnaligned32(src + x + 1);
+ s[2] = LoadUnaligned32(src + x + 2);
+ s[3] = LoadUnaligned32(src + x + 3);
+ s[4] = LoadUnaligned32(src + x + 4);
+ s[5] = LoadUnaligned32(src + x + 5);
+ s[6] = LoadUnaligned32(src + x + 6);
+ WienerHorizontalTap7Kernel(s, filter, *wiener_buffer + x);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i* const coefficients,
+ int16_t** const wiener_buffer) {
+ const __m256i filter =
+ _mm256_shuffle_epi8(*coefficients, _mm256_set1_epi32(0x05040302));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i s[5];
+ s[0] = LoadUnaligned32(src + x + 0);
+ s[1] = LoadUnaligned32(src + x + 1);
+ s[2] = LoadUnaligned32(src + x + 2);
+ s[3] = LoadUnaligned32(src + x + 3);
+ s[4] = LoadUnaligned32(src + x + 4);
+ WienerHorizontalTap5Kernel(s, filter, *wiener_buffer + x);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i* const coefficients,
+ int16_t** const wiener_buffer) {
+ const __m256i filter = _mm256_shuffle_epi32(*coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i s[3];
+ s[0] = LoadUnaligned32(src + x + 0);
+ s[1] = LoadUnaligned32(src + x + 1);
+ s[2] = LoadUnaligned32(src + x + 2);
+ WienerHorizontalTap3Kernel(s, filter, *wiener_buffer + x);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
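+// Identity (1-tap) case: the source is only scaled into the intermediate
+// precision used by the filtered paths; the left shift by 4 matches
+// kWienerFilterBits - kInterRoundBitsHorizontal for this configuration.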
+inline void WienerHorizontalTap1(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m256i s0 = LoadUnaligned32(src + x);
+ const __m256i d0 = _mm256_slli_epi16(s0, 4);
+ StoreAligned32(*wiener_buffer + x, d0);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline __m256i WienerVertical7(const __m256i a[4], const __m256i filter[4]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i madd2 = _mm256_madd_epi16(a[2], filter[2]);
+ const __m256i madd3 = _mm256_madd_epi16(a[3], filter[3]);
+ const __m256i madd01 = _mm256_add_epi32(madd0, madd1);
+ const __m256i madd23 = _mm256_add_epi32(madd2, madd3);
+ const __m256i sum = _mm256_add_epi32(madd01, madd23);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical5(const __m256i a[3], const __m256i filter[3]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i madd2 = _mm256_madd_epi16(a[2], filter[2]);
+ const __m256i madd01 = _mm256_add_epi32(madd0, madd1);
+ const __m256i sum = _mm256_add_epi32(madd01, madd2);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical3(const __m256i a[2], const __m256i filter[2]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i sum = _mm256_add_epi32(madd0, madd1);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVerticalClip(const __m256i s[2]) {
+ const __m256i d = _mm256_packus_epi32(s[0], s[1]);
+ return _mm256_min_epu16(d, _mm256_set1_epi16(1023));
+}
+
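+// The vertical filters interleave pairs of rows so _mm256_madd_epi16() applies
+// two taps per instruction; the last filter word pairs the final tap with a
+// coefficient of 1 so the rounding constant is added by the same madd.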
+inline __m256i WienerVerticalFilter7(const __m256i a[7],
+ const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[4], c[2];
+ b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm256_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm256_unpacklo_epi16(a[4], a[5]);
+ b[3] = _mm256_unpacklo_epi16(a[6], round);
+ c[0] = WienerVertical7(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm256_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm256_unpackhi_epi16(a[4], a[5]);
+ b[3] = _mm256_unpackhi_epi16(a[6], round);
+ c[1] = WienerVertical7(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalFilter5(const __m256i a[5],
+ const __m256i filter[3]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[3], c[2];
+ b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm256_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm256_unpacklo_epi16(a[4], round);
+ c[0] = WienerVertical5(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm256_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm256_unpackhi_epi16(a[4], round);
+ c[1] = WienerVertical5(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalFilter3(const __m256i a[3],
+ const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[2], c[2];
+ b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm256_unpacklo_epi16(a[2], round);
+ c[0] = WienerVertical3(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm256_unpackhi_epi16(a[2], round);
+ c[1] = WienerVertical3(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[7]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned32(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m256i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[3], __m256i a[5]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m256i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[3]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[8];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[7] = LoadAligned32(wiener_buffer + 7 * wiener_stride);
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[3], __m256i d[2]) {
+ __m256i a[6];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[4];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
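+ // The Wiener filter is symmetric: the 7 taps are (c0, c1, c2, c3, c2, c1,
+ // c0), so only |coefficients[0..3]| are stored. |filter| holds the 16-bit
+ // madd pairs (c0, c1), (c2, c3), (c2, c1) and (c0, 1); the unit coefficient
+ // in the last pair multiplies the rounding constant that the vertical
+ // kernels interleave with a[6].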
+ const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+ __m256i filter[4];
+ filter[0] = _mm256_shuffle_epi32(c, 0x0);
+ filter[1] = _mm256_shuffle_epi32(c, 0x55);
+ filter[2] = _mm256_shuffle_epi8(c, _mm256_set1_epi32(0x03020504));
+ filter[3] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2];
+ WienerVerticalTap7Kernel2(wiener_buffer + x, width, filter, d);
+ StoreUnaligned32(dst + x, d[0]);
+ StoreUnaligned32(dst + dst_stride + x, d[1]);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[7];
+ const __m256i d =
+ WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+ StoreUnaligned32(dst + x, d);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+ __m256i filter[3];
+ filter[0] = _mm256_shuffle_epi32(c, 0x0);
+ filter[1] = _mm256_shuffle_epi8(c, _mm256_set1_epi32(0x03020504));
+ filter[2] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2];
+ WienerVerticalTap5Kernel2(wiener_buffer + x, width, filter, d);
+ StoreUnaligned32(dst + x, d[0]);
+ StoreUnaligned32(dst + dst_stride + x, d[1]);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[5];
+ const __m256i d =
+ WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+ StoreUnaligned32(dst + x, d);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ __m256i filter[2];
+ filter[0] =
+ _mm256_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ filter[1] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2];
+ WienerVerticalTap3Kernel2(wiener_buffer + x, width, filter, d);
+ StoreUnaligned32(dst + x, d[0]);
+ StoreUnaligned32(dst + dst_stride + x, d[1]);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[3];
+ const __m256i d =
+ WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+ StoreUnaligned32(dst + x, d);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint16_t* const dst) {
+ const __m256i a = LoadAligned32(wiener_buffer);
+ const __m256i b = _mm256_add_epi16(a, _mm256_set1_epi16(8));
+ const __m256i c = _mm256_srai_epi16(b, 4);
+ const __m256i d = _mm256_max_epi16(c, _mm256_setzero_si256());
+ const __m256i e = _mm256_min_epi16(d, _mm256_set1_epi16(1023));
+ StoreUnaligned32(dst, e);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint16_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 16;
+ } while (x < width);
+ }
+}
+
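+// Because the Wiener filter is symmetric, |number_leading_zero_coefficients|
+// leading zeros imply the same number of trailing zeros, so a filter with k
+// leading zeros reduces to a (7 - 2 * k)-tap filter. Both the horizontal and
+// the vertical pass dispatch on that count.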
+void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info,
+ const void* const source, const void* const top_border,
+ const void* const bottom_border, const ptrdiff_t stride,
+ const int width, const int height,
+ RestorationBuffer* const restoration_buffer,
+ void* const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 16);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 15 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+ // horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint16_t*>(source);
+ const auto* const top = static_cast<const uint16_t*>(top_border);
+ const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+ const __m128i c =
+ LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+ const __m256i coefficients_horizontal = _mm256_broadcastq_epi64(c);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
+ wiener_stride, height_extra, &coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
+ wiener_stride, height_extra, &coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
+ wiener_stride, height_extra, &coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
+ wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ }
+
+ // vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint16_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+ // Because the top row of |source| is a duplicate of the second row, and
+ // the bottom row of |source| is a duplicate of the row above it, we can
+ // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_AVX2(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_AVX2;
+#endif
+}
+
+} // namespace
+
+void LoopRestorationInit10bpp_AVX2() { Init10bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !(LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit10bpp_AVX2() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/x86/loop_restoration_10bit_sse4.cc b/src/dsp/x86/loop_restoration_10bit_sse4.cc
new file mode 100644
index 0000000..0598435
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_10bit_sse4.cc
@@ -0,0 +1,551 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline void WienerHorizontalClip(const __m128i s[2],
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit = (offset << 2) - 1;
+ const __m128i offsets = _mm_set1_epi16(-offset);
+ const __m128i limits = _mm_set1_epi16(limit - offset);
+ const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsHorizontal - 1));
+ const __m128i sum0 = _mm_add_epi32(s[0], round);
+ const __m128i sum1 = _mm_add_epi32(s[1], round);
+ const __m128i rounded_sum0 = _mm_srai_epi32(sum0, kInterRoundBitsHorizontal);
+ const __m128i rounded_sum1 = _mm_srai_epi32(sum1, kInterRoundBitsHorizontal);
+ const __m128i rounded_sum = _mm_packs_epi32(rounded_sum0, rounded_sum1);
+ const __m128i d0 = _mm_max_epi16(rounded_sum, offsets);
+ const __m128i d1 = _mm_min_epi16(d0, limits);
+ StoreAligned16(wiener_buffer, d1);
+}
+
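+// The symmetric taps are folded before multiplying: s[0] + s[6], s[1] + s[5]
+// and s[2] + s[4] share the coefficients c0, c1 and c2, so each output needs
+// only the two _mm_madd_epi16() coefficient pairs (c0, c1) and (c2, c3).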
+inline void WienerHorizontalTap7(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ __m128i filter[2];
+ filter[0] = _mm_shuffle_epi32(coefficients, 0x0);
+ filter[1] = _mm_shuffle_epi32(coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[7], madds[4];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ s[5] = LoadUnaligned16(src + x + 5);
+ s[6] = LoadUnaligned16(src + x + 6);
+ const __m128i s06 = _mm_add_epi16(s[0], s[6]);
+ const __m128i s15 = _mm_add_epi16(s[1], s[5]);
+ const __m128i s24 = _mm_add_epi16(s[2], s[4]);
+ const __m128i ss0 = _mm_unpacklo_epi16(s06, s15);
+ const __m128i ss1 = _mm_unpackhi_epi16(s06, s15);
+ const __m128i ss2 = _mm_unpacklo_epi16(s24, s[3]);
+ const __m128i ss3 = _mm_unpackhi_epi16(s24, s[3]);
+ madds[0] = _mm_madd_epi16(ss0, filter[0]);
+ madds[1] = _mm_madd_epi16(ss1, filter[0]);
+ madds[2] = _mm_madd_epi16(ss2, filter[1]);
+ madds[3] = _mm_madd_epi16(ss3, filter[1]);
+ madds[0] = _mm_add_epi32(madds[0], madds[2]);
+ madds[1] = _mm_add_epi32(madds[1], madds[3]);
+ WienerHorizontalClip(madds, *wiener_buffer + x);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
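+// The 5-tap case applies the taps (c1, c2, c3, c2, c1) where, because the
+// Wiener taps sum to 128 (1 << kWienerFilterBits), the center tap is
+// c3 = 128 - 2 * (c1 + c2). Subtracting 2 * s[2] from the folded outer taps
+// and adding back s[2] * 128 (the << 7 below) applies that center tap without
+// needing c3 in a register.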
+inline void WienerHorizontalTap5(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const __m128i filter =
+ _mm_shuffle_epi8(coefficients, _mm_set1_epi32(0x05040302));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[5], madds[2];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ const __m128i s04 = _mm_add_epi16(s[0], s[4]);
+ const __m128i s13 = _mm_add_epi16(s[1], s[3]);
+ const __m128i s2d = _mm_add_epi16(s[2], s[2]);
+ const __m128i s0m = _mm_sub_epi16(s04, s2d);
+ const __m128i s1m = _mm_sub_epi16(s13, s2d);
+ const __m128i ss0 = _mm_unpacklo_epi16(s0m, s1m);
+ const __m128i ss1 = _mm_unpackhi_epi16(s0m, s1m);
+ madds[0] = _mm_madd_epi16(ss0, filter);
+ madds[1] = _mm_madd_epi16(ss1, filter);
+ const __m128i s2_lo = _mm_unpacklo_epi16(s[2], _mm_setzero_si128());
+ const __m128i s2_hi = _mm_unpackhi_epi16(s[2], _mm_setzero_si128());
+ const __m128i s2x128_lo = _mm_slli_epi32(s2_lo, 7);
+ const __m128i s2x128_hi = _mm_slli_epi32(s2_hi, 7);
+ madds[0] = _mm_add_epi32(madds[0], s2x128_lo);
+ madds[1] = _mm_add_epi32(madds[1], s2x128_hi);
+ WienerHorizontalClip(madds, *wiener_buffer + x);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const auto filter = _mm_shuffle_epi32(coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[3], madds[2];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ const __m128i s02 = _mm_add_epi16(s[0], s[2]);
+ const __m128i ss0 = _mm_unpacklo_epi16(s02, s[1]);
+ const __m128i ss1 = _mm_unpackhi_epi16(s02, s[1]);
+ madds[0] = _mm_madd_epi16(ss0, filter);
+ madds[1] = _mm_madd_epi16(ss1, filter);
+ WienerHorizontalClip(madds, *wiener_buffer + x);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
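+// With three leading zero coefficients the horizontal filter is just the
+// center tap (128), so filtering and rounding reduce to a plain left shift by
+// 4 (kWienerFilterBits - kInterRoundBitsHorizontal).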
+inline void WienerHorizontalTap1(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m128i s = LoadUnaligned16(src + x);
+ const __m128i d = _mm_slli_epi16(s, 4);
+ StoreAligned16(*wiener_buffer + x, d);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline __m128i WienerVertical7(const __m128i a[4], const __m128i filter[4]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]);
+ const __m128i madd3 = _mm_madd_epi16(a[3], filter[3]);
+ const __m128i madd01 = _mm_add_epi32(madd0, madd1);
+ const __m128i madd23 = _mm_add_epi32(madd2, madd3);
+ const __m128i sum = _mm_add_epi32(madd01, madd23);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical5(const __m128i a[3], const __m128i filter[3]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]);
+ const __m128i madd01 = _mm_add_epi32(madd0, madd1);
+ const __m128i sum = _mm_add_epi32(madd01, madd2);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical3(const __m128i a[2], const __m128i filter[2]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i sum = _mm_add_epi32(madd0, madd1);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
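+// Pack the two 32-bit halves to unsigned 16 bits with saturation, then clamp
+// to 1023, the 10-bit pixel maximum.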
+inline __m128i WienerVerticalClip(const __m128i s[2]) {
+ const __m128i d = _mm_packus_epi32(s[0], s[1]);
+ return _mm_min_epu16(d, _mm_set1_epi16(1023));
+}
+
+inline __m128i WienerVerticalFilter7(const __m128i a[7],
+ const __m128i filter[4]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m128i b[4], c[2];
+ b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm_unpacklo_epi16(a[4], a[5]);
+ b[3] = _mm_unpacklo_epi16(a[6], round);
+ c[0] = WienerVertical7(b, filter);
+ b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm_unpackhi_epi16(a[4], a[5]);
+ b[3] = _mm_unpackhi_epi16(a[6], round);
+ c[1] = WienerVertical7(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalFilter5(const __m128i a[5],
+ const __m128i filter[3]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m128i b[3], c[2];
+ b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm_unpacklo_epi16(a[4], round);
+ c[0] = WienerVertical5(b, filter);
+ b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm_unpackhi_epi16(a[4], round);
+ c[1] = WienerVertical5(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalFilter3(const __m128i a[3],
+ const __m128i filter[2]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m128i b[2], c[2];
+ b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm_unpacklo_epi16(a[2], round);
+ c[0] = WienerVertical3(b, filter);
+ b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm_unpackhi_epi16(a[2], round);
+ c[1] = WienerVertical3(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[4], __m128i a[7]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[3], __m128i a[5]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i a[3]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i c = LoadLo8(coefficients);
+ __m128i filter[4];
+ filter[0] = _mm_shuffle_epi32(c, 0x0);
+ filter[1] = _mm_shuffle_epi32(c, 0x55);
+ filter[2] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
+ filter[3] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[8], d[2];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+ a[7] = LoadAligned16(wiener_buffer + x + 7 * width);
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+ StoreAligned16(dst + x, d[0]);
+ StoreAligned16(dst + dst_stride + x, d[1]);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[7];
+ const __m128i d =
+ WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+ StoreAligned16(dst + x, d);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i c = LoadLo8(coefficients);
+ __m128i filter[3];
+ filter[0] = _mm_shuffle_epi32(c, 0x0);
+ filter[1] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
+ filter[2] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[6], d[2];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+ a[5] = LoadAligned16(wiener_buffer + x + 5 * width);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+ StoreAligned16(dst + x, d[0]);
+ StoreAligned16(dst + dst_stride + x, d[1]);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[5];
+ const __m128i d =
+ WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+ StoreAligned16(dst + x, d);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ __m128i filter[2];
+ filter[0] = _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ filter[1] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[4], d[2];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+ a[3] = LoadAligned16(wiener_buffer + x + 3 * width);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+ StoreAligned16(dst + x, d[0]);
+ StoreAligned16(dst + dst_stride + x, d[1]);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[3];
+ const __m128i d =
+ WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+ StoreAligned16(dst + x, d);
+ x += 8;
+ } while (x < width);
+ }
+}
+
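+// With a single center tap (128) left in the vertical filter, filtering and
+// rounding reduce to (v + 8) >> 4, clamped to the 10-bit range [0, 1023].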
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint16_t* const dst) {
+ const __m128i a = LoadAligned16(wiener_buffer);
+ const __m128i b = _mm_add_epi16(a, _mm_set1_epi16(8));
+ const __m128i c = _mm_srai_epi16(b, 4);
+ const __m128i d = _mm_max_epi16(c, _mm_setzero_si128());
+ const __m128i e = _mm_min_epi16(d, _mm_set1_epi16(1023));
+ StoreAligned16(dst, e);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint16_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info,
+ const void* const source, const void* const top_border,
+ const void* const bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height,
+ RestorationBuffer* const restoration_buffer,
+ void* const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 16);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 15 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+ // horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint16_t*>(source);
+ const auto* const top = static_cast<const uint16_t*>(top_border);
+ const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+ const __m128i coefficients_horizontal =
+ LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
+ wiener_stride, height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
+ wiener_stride, height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
+ wiener_stride, height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
+ wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ }
+
+ // vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint16_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+ // Because the top row of |source| is a duplicate of the second row, and
+ // the bottom row of |source| is a duplicate of the row above it, we can
+ // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_SSE4_1;
+#else
+ static_cast<void>(WienerFilter_SSE4_1);
+#endif
+}
+
+} // namespace
+
+void LoopRestorationInit10bpp_SSE4_1() { Init10bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !(LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit10bpp_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/x86/loop_restoration_avx2.cc b/src/dsp/x86/loop_restoration_avx2.cc
new file mode 100644
index 0000000..7ae7c90
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_avx2.cc
@@ -0,0 +1,2902 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+inline void WienerHorizontalClip(const __m256i s[2], const __m256i s_3x128,
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit =
+ (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
+ const __m256i offsets = _mm256_set1_epi16(-offset);
+ const __m256i limits = _mm256_set1_epi16(limit - offset);
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsHorizontal - 1));
+ // The sum range here is [-128 * 255, 90 * 255].
+ const __m256i madd = _mm256_add_epi16(s[0], s[1]);
+ const __m256i sum = _mm256_add_epi16(madd, round);
+ const __m256i rounded_sum0 =
+ _mm256_srai_epi16(sum, kInterRoundBitsHorizontal);
+ // Add back scaled down offset correction.
+ const __m256i rounded_sum1 = _mm256_add_epi16(rounded_sum0, s_3x128);
+ const __m256i d0 = _mm256_max_epi16(rounded_sum1, offsets);
+ const __m256i d1 = _mm256_min_epi16(d0, limits);
+ StoreAligned32(wiener_buffer, d1);
+}
+
+// Using _mm256_alignr_epi8() to build the shifted input vectors is about 8%
+// faster than loading all of them and unpacking, because the compiler
+// generates redundant code for the load-and-unpack approach.
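+// _mm256_maddubs_epi16() multiplies unsigned pixels by signed 8-bit
+// coefficients. The filter registers hold the byte pairs (c0, c1),
+// (c2, c3 - 128), (c2, c1) and (c0, 0), so the four madds below apply the
+// symmetric taps (c0, c1, c2, c3, c2, c1, c0); the missing 128 * s[3] term is
+// added back, pre-shifted, via |s_3x128|.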
+inline void WienerHorizontalTap7Kernel(const __m256i s[2],
+ const __m256i filter[4],
+ int16_t* const wiener_buffer) {
+ const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+ const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+ const auto s45 = _mm256_alignr_epi8(s[1], s[0], 9);
+ const auto s67 = _mm256_alignr_epi8(s[1], s[0], 13);
+ __m256i madds[4];
+ madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+ madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+ madds[2] = _mm256_maddubs_epi16(s45, filter[2]);
+ madds[3] = _mm256_maddubs_epi16(s67, filter[3]);
+ madds[0] = _mm256_add_epi16(madds[0], madds[2]);
+ madds[1] = _mm256_add_epi16(madds[1], madds[3]);
+ const __m256i s_3x128 = _mm256_slli_epi16(_mm256_srli_epi16(s23, 8),
+ 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m256i s[2],
+ const __m256i filter[3],
+ int16_t* const wiener_buffer) {
+ const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+ const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+ const auto s45 = _mm256_alignr_epi8(s[1], s[0], 9);
+ __m256i madds[3];
+ madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+ madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+ madds[2] = _mm256_maddubs_epi16(s45, filter[2]);
+ madds[0] = _mm256_add_epi16(madds[0], madds[2]);
+ const __m256i s_3x128 = _mm256_srli_epi16(_mm256_slli_epi16(s23, 8),
+ kInterRoundBitsHorizontal + 1);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap3Kernel(const __m256i s[2],
+ const __m256i filter[2],
+ int16_t* const wiener_buffer) {
+ const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+ const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+ __m256i madds[2];
+ madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+ madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+ const __m256i s_3x128 = _mm256_slli_epi16(_mm256_srli_epi16(s01, 8),
+ 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[4];
+ filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0100));
+ filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302));
+ filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0102));
+ filter[3] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8000));
+ for (int y = height; y != 0; --y) {
+ __m256i s = LoadUnaligned32(src);
+ __m256i ss[4];
+ ss[0] = _mm256_unpacklo_epi8(s, s);
+ ptrdiff_t x = 0;
+ do {
+ ss[1] = _mm256_unpackhi_epi8(s, s);
+ s = LoadUnaligned32(src + x + 32);
+ ss[3] = _mm256_unpacklo_epi8(s, s);
+ ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+ WienerHorizontalTap7Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+ WienerHorizontalTap7Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+ ss[0] = ss[3];
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[3];
+ filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0201));
+ filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0203));
+ filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8001));
+ for (int y = height; y != 0; --y) {
+ __m256i s = LoadUnaligned32(src);
+ __m256i ss[4];
+ ss[0] = _mm256_unpacklo_epi8(s, s);
+ ptrdiff_t x = 0;
+ do {
+ ss[1] = _mm256_unpackhi_epi8(s, s);
+ s = LoadUnaligned32(src + x + 32);
+ ss[3] = _mm256_unpacklo_epi8(s, s);
+ ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+ WienerHorizontalTap5Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+ WienerHorizontalTap5Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+ ss[0] = ss[3];
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302));
+ filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8002));
+ for (int y = height; y != 0; --y) {
+ __m256i s = LoadUnaligned32(src);
+ __m256i ss[4];
+ ss[0] = _mm256_unpacklo_epi8(s, s);
+ ptrdiff_t x = 0;
+ do {
+ ss[1] = _mm256_unpackhi_epi8(s, s);
+ s = LoadUnaligned32(src + x + 32);
+ ss[3] = _mm256_unpacklo_epi8(s, s);
+ ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+ WienerHorizontalTap3Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+ WienerHorizontalTap3Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+ ss[0] = ss[3];
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m256i s = LoadUnaligned32(src + x);
+ const __m256i s0 = _mm256_unpacklo_epi8(s, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi8(s, _mm256_setzero_si256());
+ __m256i d[2];
+ d[0] = _mm256_slli_epi16(s0, 4);
+ d[1] = _mm256_slli_epi16(s1, 4);
+ StoreAligned64(*wiener_buffer + x, d);
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline __m256i WienerVertical7(const __m256i a[2], const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i sum0 = _mm256_add_epi32(round, madd0);
+ const __m256i sum1 = _mm256_add_epi32(sum0, madd1);
+ return _mm256_srai_epi32(sum1, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical5(const __m256i a[2], const __m256i filter[2]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i sum = _mm256_add_epi32(madd0, madd1);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical3(const __m256i a, const __m256i filter) {
+ const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+ const __m256i madd = _mm256_madd_epi16(a, filter);
+ const __m256i sum = _mm256_add_epi32(round, madd);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVerticalFilter7(const __m256i a[7],
+ const __m256i filter[2]) {
+ __m256i b[2];
+ const __m256i a06 = _mm256_add_epi16(a[0], a[6]);
+ const __m256i a15 = _mm256_add_epi16(a[1], a[5]);
+ const __m256i a24 = _mm256_add_epi16(a[2], a[4]);
+ b[0] = _mm256_unpacklo_epi16(a06, a15);
+ b[1] = _mm256_unpacklo_epi16(a24, a[3]);
+ const __m256i sum0 = WienerVertical7(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a06, a15);
+ b[1] = _mm256_unpackhi_epi16(a24, a[3]);
+ const __m256i sum1 = WienerVertical7(b, filter);
+ return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalFilter5(const __m256i a[5],
+ const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[2];
+ const __m256i a04 = _mm256_add_epi16(a[0], a[4]);
+ const __m256i a13 = _mm256_add_epi16(a[1], a[3]);
+ b[0] = _mm256_unpacklo_epi16(a04, a13);
+ b[1] = _mm256_unpacklo_epi16(a[2], round);
+ const __m256i sum0 = WienerVertical5(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a04, a13);
+ b[1] = _mm256_unpackhi_epi16(a[2], round);
+ const __m256i sum1 = WienerVertical5(b, filter);
+ return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalFilter3(const __m256i a[3], const __m256i filter) {
+ __m256i b;
+ const __m256i a02 = _mm256_add_epi16(a[0], a[2]);
+ b = _mm256_unpacklo_epi16(a02, a[1]);
+ const __m256i sum0 = WienerVertical3(b, filter);
+ b = _mm256_unpackhi_epi16(a02, a[1]);
+ const __m256i sum1 = WienerVertical3(b, filter);
+ return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[7]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned32(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m256i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[5]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m256i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter, __m256i a[3]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[8];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[7] = LoadAligned32(wiener_buffer + 7 * wiener_stride);
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[6];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter, __m256i d[2]) {
+ __m256i a[4];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi32(c, 0x0);
+ filter[1] = _mm256_shuffle_epi32(c, 0x55);
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2][2];
+ WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap7Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+ StoreUnaligned32(dst + dst_stride + x,
+ _mm256_packus_epi16(d[0][1], d[1][1]));
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[7];
+ const __m256i d0 =
+ WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m256i d1 =
+ WienerVerticalTap7Kernel(wiener_buffer + x + 16, width, filter, a);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ x += 32;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastd_epi32(Load4(coefficients));
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi32(c, 0);
+ filter[1] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[2]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2][2];
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+ StoreUnaligned32(dst + dst_stride + x,
+ _mm256_packus_epi16(d[0][1], d[1][1]));
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[5];
+ const __m256i d0 =
+ WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m256i d1 =
+ WienerVerticalTap5Kernel(wiener_buffer + x + 16, width, filter, a);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ x += 32;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i filter =
+ _mm256_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2][2];
+ WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap3Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+ StoreUnaligned32(dst + dst_stride + x,
+ _mm256_packus_epi16(d[0][1], d[1][1]));
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[3];
+ const __m256i d0 =
+ WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m256i d1 =
+ WienerVerticalTap3Kernel(wiener_buffer + x + 16, width, filter, a);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ x += 32;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint8_t* const dst) {
+ const __m256i a0 = LoadAligned32(wiener_buffer + 0);
+ const __m256i a1 = LoadAligned32(wiener_buffer + 16);
+ const __m256i b0 = _mm256_add_epi16(a0, _mm256_set1_epi16(8));
+ const __m256i b1 = _mm256_add_epi16(a1, _mm256_set1_epi16(8));
+ const __m256i c0 = _mm256_srai_epi16(b0, 4);
+ const __m256i c1 = _mm256_srai_epi16(b1, 4);
+ const __m256i d = _mm256_packus_epi16(c0, c1);
+ StoreUnaligned32(dst, d);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 32;
+ } while (x < width);
+ }
+}
+
+void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info,
+ const void* const source, const void* const top_border,
+ const void* const bottom_border, const ptrdiff_t stride,
+ const int width, const int height,
+ RestorationBuffer* const restoration_buffer,
+ void* const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 32);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+ // horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* const top = static_cast<const uint8_t*>(top_border);
+ const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+ const __m128i c =
+ LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+ // To keep the horizontal pass's intermediate values within 16 bits, we
+ // offset |filter[3]| by 128. The 128 offset is added back in the loop.
+ __m128i c_horizontal =
+ _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
+ c_horizontal = _mm_packs_epi16(c_horizontal, c_horizontal);
+ const __m256i coefficients_horizontal = _mm256_broadcastd_epi32(c_horizontal);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
+ wiener_stride, height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
+ wiener_stride, height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
+ wiener_stride, height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
+ wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ }
+
+ // vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+ // Because the top row of |source| is a duplicate of the second row, and
+ // the bottom row of |source| is a duplicate of the row above it, we can
+ // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+constexpr int kSumOffset = 24;
+
+// The SIMD loads over-read up to (number of bytes in a SIMD register) -
+// (width % 16) - 2 * padding pixels, where padding is 3 for Pass 1 and 2 for
+// Pass 2. A SIMD register holds 16 bytes for SSE4.1 and 32 bytes for AVX2.
+constexpr int kOverreadInBytesPass1_128 = 10;
+constexpr int kOverreadInBytesPass2_128 = 12;
+constexpr int kOverreadInBytesPass1_256 = kOverreadInBytesPass1_128 + 16;
+constexpr int kOverreadInBytesPass2_256 = kOverreadInBytesPass2_128 + 16;
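+// I.e. 16 - 2 * 3 = 10 and 16 - 2 * 2 = 12 bytes for the SSE4.1 constants;
+// the AVX2 constants add 16 for the wider registers.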
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned32x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m256i dst[2]) {
+ dst[0] = LoadAligned32(src[0] + x);
+ dst[1] = LoadAligned32(src[1] + x);
+}
+
+inline void LoadAligned32x2U16Msan(const uint16_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[2]) {
+ dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+ dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+ dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned32x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m256i dst[3]) {
+ dst[0] = LoadAligned32(src[0] + x);
+ dst[1] = LoadAligned32(src[1] + x);
+ dst[2] = LoadAligned32(src[2] + x);
+}
+
+inline void LoadAligned32x3U16Msan(const uint16_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[3]) {
+ dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+ dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+ dst[2] = LoadAligned32Msan(src[2] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+ dst[0] = LoadAligned16(src + 0);
+ dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m256i dst[2][2]) {
+ LoadAligned64(src[0] + x, dst[0]);
+ LoadAligned64(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32Msan(const uint32_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[2][2]) {
+ LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+ LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+ LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m256i dst[3][2]) {
+ LoadAligned64(src[0] + x, dst[0]);
+ LoadAligned64(src[1] + x, dst[1]);
+ LoadAligned64(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32Msan(const uint32_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[3][2]) {
+ LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+ LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+ LoadAligned64Msan(src[2] + x, sizeof(**src) * (x + 16 - border), dst[2]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 4, src[1]);
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate super inefficient code and the whole
+// decoder could be 15% slower.
+
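+// The Vaddl*/Vaddw*/Vmull* helpers below mirror the NEON naming convention:
+// "l" widens both operands before the add/multiply, "w" widens only the
+// second operand, and "Lo"/"Hi" select the low or high half (per 128-bit lane
+// for AVX2) of the inputs.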
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlLo8(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi8(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlHi8(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi8(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(s0, s1);
+}
+
+inline __m256i VaddlLo16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(s0, s1);
+}
+
+inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(s0, s1);
+}
+
+inline __m256i VaddlHi16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwLo8(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwHi8(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(src0, s1);
+}
+
+inline __m256i VaddwLo16(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(src0, s1);
+}
+
+inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(src0, s1);
+}
+
+inline __m256i VaddwHi16(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(src0, s1);
+}
+
+// Using VgetLane16() can save a sign extension instruction.
+template <int n>
+inline int VgetLane16(__m256i src) {
+ return _mm256_extract_epi16(src, n);
+}
+
+template <int n>
+inline int VgetLane8(__m256i src) {
+ return _mm256_extract_epi8(src, n);
+}
+
+inline __m256i VmullNLo8(const __m256i src0, const int src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m256i VmullNHi8(const __m256i src0, const int src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullLo16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullHi16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, s1);
+}
+
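+// VrshrS32()/VrshrU32() perform a rounding right shift: they add the rounding
+// constant 1 << (bits - 1) and then shift right (arithmetic for S32, logical
+// for U32), matching the NEON vrshr semantics.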
+inline __m256i VrshrS32(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+ return _mm256_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srli_epi32(sum, src1);
+}
+
+inline __m256i VrshrU32(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+ return _mm256_srli_epi32(sum, src1);
+}
+
+inline __m128i SquareLo8(const __m128i src) {
+ const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
+}
+
+inline __m256i SquareLo8(const __m256i src) {
+ const __m256i s = _mm256_unpacklo_epi8(src, _mm256_setzero_si256());
+ return _mm256_mullo_epi16(s, s);
+}
+
+inline __m128i SquareHi8(const __m128i src) {
+ const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
+}
+
+inline __m256i SquareHi8(const __m256i src) {
+ const __m256i s = _mm256_unpackhi_epi8(src, _mm256_setzero_si256());
+ return _mm256_mullo_epi16(s, s);
+}
+
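+// The Prepare3*/Prepare5* helpers produce byte- or word-shifted copies of the
+// source so that dst[k] holds the input advanced by k elements; horizontal
+// 3-tap and 5-tap sums can then be formed with plain vertical adds.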
+inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+}
+
+inline void Prepare3_8(const __m256i src[2], __m256i dst[3]) {
+ dst[0] = _mm256_alignr_epi8(src[1], src[0], 0);
+ dst[1] = _mm256_alignr_epi8(src[1], src[0], 1);
+ dst[2] = _mm256_alignr_epi8(src[1], src[0], 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare3_16(const __m256i src[2], __m256i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm256_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm256_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+ dst[3] = _mm_srli_si128(src, 3);
+ dst[4] = _mm_srli_si128(src, 4);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_16(const __m256i src[2], __m256i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm256_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm256_alignr_epi8(src[1], src[0], 8);
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi16(src0, src1);
+ return _mm_add_epi16(sum, src2);
+}
+
+inline __m256i Sum3_16(const __m256i src0, const __m256i src1,
+ const __m256i src2) {
+ const __m256i sum = _mm256_add_epi16(src0, src1);
+ return _mm256_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m256i Sum3_16(const __m256i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi32(src0, src1);
+ return _mm_add_epi32(sum, src2);
+}
+
+inline __m256i Sum3_32(const __m256i src0, const __m256i src1,
+ const __m256i src2) {
+ const __m256i sum = _mm256_add_epi32(src0, src1);
+ return _mm256_add_epi32(sum, src2);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline void Sum3_32(const __m256i src[3][2], __m256i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+ const __m128i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WLo16(const __m256i src[3]) {
+ const __m256i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WHi16(const __m256i src[3]) {
+ const __m256i sum = VaddlHi8(src[0], src[1]);
+ return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum3WLo32(const __m128i src[3]) {
+ const __m128i sum = VaddlLo16(src[0], src[1]);
+ return VaddwLo16(sum, src[2]);
+}
+
+inline __m256i Sum3WLo32(const __m256i src[3]) {
+ const __m256i sum = VaddlLo16(src[0], src[1]);
+ return VaddwLo16(sum, src[2]);
+}
+
+inline __m128i Sum3WHi32(const __m128i src[3]) {
+ const __m128i sum = VaddlHi16(src[0], src[1]);
+ return VaddwHi16(sum, src[2]);
+}
+
+inline __m256i Sum3WHi32(const __m256i src[3]) {
+ const __m256i sum = VaddlHi16(src[0], src[1]);
+ return VaddwHi16(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+ const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+ const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m256i Sum5_16(const __m256i src[5]) {
+ const __m256i sum01 = _mm256_add_epi16(src[0], src[1]);
+ const __m256i sum23 = _mm256_add_epi16(src[2], src[3]);
+ const __m256i sum = _mm256_add_epi16(sum01, sum23);
+ return _mm256_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+ const __m128i* const src2, const __m128i* const src3,
+ const __m128i* const src4) {
+ const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+ const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+ const __m128i sum = _mm_add_epi32(sum01, sum23);
+ return _mm_add_epi32(sum, *src4);
+}
+
+inline __m256i Sum5_32(const __m256i* const src0, const __m256i* const src1,
+ const __m256i* const src2, const __m256i* const src3,
+ const __m256i* const src4) {
+ const __m256i sum01 = _mm256_add_epi32(*src0, *src1);
+ const __m256i sum23 = _mm256_add_epi32(*src2, *src3);
+ const __m256i sum = _mm256_add_epi32(sum01, sum23);
+ return _mm256_add_epi32(sum, *src4);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline void Sum5_32(const __m256i src[5][2], __m256i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum5WLo16(const __m128i src[5]) {
+ const __m128i sum01 = VaddlLo8(src[0], src[1]);
+ const __m128i sum23 = VaddlLo8(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return VaddwLo8(sum, src[4]);
+}
+
+inline __m256i Sum5WLo16(const __m256i src[5]) {
+ const __m256i sum01 = VaddlLo8(src[0], src[1]);
+ const __m256i sum23 = VaddlLo8(src[2], src[3]);
+ const __m256i sum = _mm256_add_epi16(sum01, sum23);
+ return VaddwLo8(sum, src[4]);
+}
+
+inline __m256i Sum5WHi16(const __m256i src[5]) {
+ const __m256i sum01 = VaddlHi8(src[0], src[1]);
+ const __m256i sum23 = VaddlHi8(src[2], src[3]);
+ const __m256i sum = _mm256_add_epi16(sum01, sum23);
+ return VaddwHi8(sum, src[4]);
+}
+
+inline __m128i Sum3Horizontal(const __m128i src) {
+ __m128i s[3];
+ Prepare3Lo8(src, s);
+ return Sum3WLo16(s);
+}
+
+inline void Sum3Horizontal(const uint8_t* const src,
+ const ptrdiff_t over_read_in_bytes, __m256i dst[2]) {
+ __m256i s[3];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+ dst[0] = Sum3WLo16(s);
+ dst[1] = Sum3WHi16(s);
+}
+
+inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum3WLo32(s);
+ dst[1] = Sum3WHi32(s);
+}
+
+inline void Sum3WHorizontal(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum3WLo32(s);
+ dst[1] = Sum3WHi32(s);
+}
+
+inline __m128i Sum5Horizontal(const __m128i src) {
+ __m128i s[5];
+ Prepare5Lo8(src, s);
+ return Sum5WLo16(s);
+}
+
+inline void Sum5Horizontal(const uint8_t* const src,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i* const dst0, __m256i* const dst1) {
+ __m256i s[5];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+ s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 3);
+ s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 4);
+ *dst0 = Sum5WLo16(s);
+ *dst1 = Sum5WHi16(s);
+}
+
+inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ const __m128i sum01_lo = VaddlLo16(s[0], s[1]);
+ const __m128i sum23_lo = VaddlLo16(s[2], s[3]);
+ const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo);
+ dst[0] = VaddwLo16(sum0123_lo, s[4]);
+ const __m128i sum01_hi = VaddlHi16(s[0], s[1]);
+ const __m128i sum23_hi = VaddlHi16(s[2], s[3]);
+ const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi);
+ dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
+inline void Sum5WHorizontal(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[5];
+ Prepare5_16(src, s);
+ const __m256i sum01_lo = VaddlLo16(s[0], s[1]);
+ const __m256i sum23_lo = VaddlLo16(s[2], s[3]);
+ const __m256i sum0123_lo = _mm256_add_epi32(sum01_lo, sum23_lo);
+ dst[0] = VaddwLo16(sum0123_lo, s[4]);
+ const __m256i sum01_hi = VaddlHi16(s[0], s[1]);
+ const __m256i sum23_hi = VaddlHi16(s[2], s[3]);
+ const __m256i sum0123_hi = _mm256_add_epi32(sum01_hi, sum23_hi);
+ dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
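+// The SumHorizontal* helpers compute the 3-tap and 5-tap horizontal sums in
+// one pass: row3 = x1 + x2 + x3 and row5 = row3 + x0 + x4, so the middle
+// three taps are shared between the two box sizes.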
+void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlLo16(src[0], src[4]);
+ *row_sq3 = Sum3WLo32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m256i src[5], __m256i* const row_sq3,
+ __m256i* const row_sq5) {
+ const __m256i sum04 = VaddlLo16(src[0], src[4]);
+ *row_sq3 = Sum3WLo32(src + 1);
+ *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlHi16(src[0], src[4]);
+ *row_sq3 = Sum3WHi32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m256i src[5], __m256i* const row_sq3,
+ __m256i* const row_sq5) {
+ const __m256i sum04 = VaddlHi16(src[0], src[4]);
+ *row_sq3 = Sum3WHi32(src + 1);
+ *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m128i src, __m128i* const row3,
+ __m128i* const row5) {
+ __m128i s[5];
+ Prepare5Lo8(src, s);
+ const __m128i sum04 = VaddlLo8(s[0], s[4]);
+ *row3 = Sum3WLo16(s + 1);
+ *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+inline void SumHorizontal(const uint8_t* const src,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i* const row3_0, __m256i* const row3_1,
+ __m256i* const row5_0, __m256i* const row5_1) {
+ __m256i s[5];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+ s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 3);
+ s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 4);
+ const __m256i sum04_lo = VaddlLo8(s[0], s[4]);
+ const __m256i sum04_hi = VaddlHi8(s[0], s[4]);
+ *row3_0 = Sum3WLo16(s + 1);
+ *row3_1 = Sum3WHi16(s + 1);
+ *row5_0 = _mm256_add_epi16(sum04_lo, *row3_0);
+ *row5_1 = _mm256_add_epi16(sum04_hi, *row3_1);
+}
+
+inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0,
+ __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+ __m128i* const row_sq5_1) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+ SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
+inline void SumHorizontal(const __m256i src[2], __m256i* const row_sq3_0,
+ __m256i* const row_sq3_1, __m256i* const row_sq5_0,
+ __m256i* const row_sq5_1) {
+ __m256i s[5];
+ Prepare5_16(src, s);
+ SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+ SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
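+// Sum343* and Sum565* apply the weighted 3-tap windows used by the SGR
+// filter: Sum343 computes 3 * (x0 + x1 + x2) + x1, i.e. weights {3, 4, 3},
+// and Sum565 computes 5 * (x0 + x1 + x2) + x1, i.e. weights {5, 6, 5}.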
+inline __m256i Sum343Lo(const __m256i ma3[3]) {
+ const __m256i sum = Sum3WLo16(ma3);
+ const __m256i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343Hi(const __m256i ma3[3]) {
+ const __m256i sum = Sum3WHi16(ma3);
+ const __m256i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343WLo(const __m256i src[3]) {
+ const __m256i sum = Sum3WLo32(src);
+ const __m256i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwLo16(sum3, src[1]);
+}
+
+inline __m256i Sum343WHi(const __m256i src[3]) {
+ const __m256i sum = Sum3WHi32(src);
+ const __m256i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwHi16(sum3, src[1]);
+}
+
+inline void Sum343W(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum343WLo(s);
+ dst[1] = Sum343WHi(s);
+}
+
+inline __m256i Sum565Lo(const __m256i src[3]) {
+ const __m256i sum = Sum3WLo16(src);
+ const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+ const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+ return VaddwLo8(sum5, src[1]);
+}
+
+inline __m256i Sum565Hi(const __m256i src[3]) {
+ const __m256i sum = Sum3WHi16(src);
+ const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+ const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+ return VaddwHi8(sum5, src[1]);
+}
+
+inline __m256i Sum565WLo(const __m256i src[3]) {
+ const __m256i sum = Sum3WLo32(src);
+ const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+ const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+ return VaddwLo16(sum5, src[1]);
+}
+
+inline __m256i Sum565WHi(const __m256i src[3]) {
+ const __m256i sum = Sum3WHi32(src);
+ const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+ const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+ return VaddwHi16(sum5, src[1]);
+}
+
+inline void Sum565W(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum565WLo(s);
+ dst[1] = Sum565WHi(s);
+}
+
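+// BoxSum() processes two rows, computing for each row the horizontal 3-tap
+// and 5-tap sums of the pixels (sum3/sum5, 16 bits) and of their squares
+// (square_sum3/square_sum5, 32 bits).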
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ int y = 2;
+ do {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src, kOverreadInBytesPass1_128 - width);
+ __m128i sq_128[2];
+ __m256i sq[3];
+ __m128i s3, s5, sq3[2], sq5[2];
+ sq_128[0] = SquareLo8(s0);
+ sq_128[1] = SquareHi8(s0);
+ SumHorizontalLo(s0, &s3, &s5);
+ StoreAligned16(sum3, s3);
+ StoreAligned16(sum5, s5);
+ SumHorizontal(sq_128, &sq3[0], &sq3[1], &sq5[0], &sq5[1]);
+ StoreAligned32U32(square_sum3, sq3);
+ StoreAligned32U32(square_sum5, sq5);
+ src += 8;
+ sum3 += 8;
+ sum5 += 8;
+ square_sum3 += 8;
+ square_sum5 += 8;
+ sq[0] = SetrM128i(sq_128[1], sq_128[1]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m256i row3[2], row5[2], row_sq3[2], row_sq5[2];
+ const __m256i s = LoadUnaligned32Msan(
+ src + 8, sum_width - x + 16 + kOverreadInBytesPass1_256 - width);
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
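+ // 0x21 selects the high 128-bit lane of sq[0] and the low lane of sq[2];
+ // this stitches the previous block's squares to the current ones so the
+ // horizontal sums below can slide across the block boundary.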
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ SumHorizontal(src, sum_width - x + 8 + kOverreadInBytesPass1_256 - width,
+ &row3[0], &row3[1], &row5[0], &row5[1]);
+ StoreAligned64(sum3, row3);
+ StoreAligned64(sum5, row5);
+ SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned64(square_sum3 + 0, row_sq3);
+ StoreAligned64(square_sum5 + 0, row_sq5);
+ SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned64(square_sum3 + 16, row_sq3);
+ StoreAligned64(square_sum5 + 16, row_sq5);
+ sq[0] = sq[2];
+ src += 32;
+ sum3 += 32;
+ sum5 += 32;
+ square_sum3 += 32;
+ square_sum5 += 32;
+ x -= 32;
+ } while (x != 0);
+ src += src_stride - sum_width - 8;
+ sum3 += sum_stride - sum_width - 8;
+ sum5 += sum_stride - sum_width - 8;
+ square_sum3 += sum_stride - sum_width - 8;
+ square_sum5 += sum_stride - sum_width - 8;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ constexpr int kOverreadInBytes_128 =
+ (size == 3) ? kOverreadInBytesPass2_128 : kOverreadInBytesPass1_128;
+ constexpr int kOverreadInBytes_256 =
+ (size == 3) ? kOverreadInBytesPass2_256 : kOverreadInBytesPass1_256;
+ int y = 2;
+ do {
+ const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytes_128 - width);
+ __m128i ss, sq_128[2], sqs[2];
+ __m256i sq[3];
+ sq_128[0] = SquareLo8(s);
+ sq_128[1] = SquareHi8(s);
+ if (size == 3) {
+ ss = Sum3Horizontal(s);
+ Sum3WHorizontal(sq_128, sqs);
+ } else {
+ ss = Sum5Horizontal(s);
+ Sum5WHorizontal(sq_128, sqs);
+ }
+ StoreAligned16(sums, ss);
+ StoreAligned32U32(square_sums, sqs);
+ src += 8;
+ sums += 8;
+ square_sums += 8;
+ sq[0] = SetrM128i(sq_128[1], sq_128[1]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m256i row[2], row_sq[4];
+ const __m256i s = LoadUnaligned32Msan(
+ src + 8, sum_width - x + 16 + kOverreadInBytes_256 - width);
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ if (size == 3) {
+ Sum3Horizontal(src, sum_width - x + 8 + kOverreadInBytes_256 - width,
+ row);
+ Sum3WHorizontal(sq + 0, row_sq + 0);
+ Sum3WHorizontal(sq + 1, row_sq + 2);
+ } else {
+ Sum5Horizontal(src, sum_width - x + 8 + kOverreadInBytes_256 - width,
+ &row[0], &row[1]);
+ Sum5WHorizontal(sq + 0, row_sq + 0);
+ Sum5WHorizontal(sq + 1, row_sq + 2);
+ }
+ StoreAligned64(sums, row);
+ StoreAligned64(square_sums + 0, row_sq + 0);
+ StoreAligned64(square_sums + 16, row_sq + 2);
+ sq[0] = sq[2];
+ src += 32;
+ sums += 32;
+ square_sums += 32;
+ x -= 32;
+ } while (x != 0);
+ src += src_stride - sum_width - 8;
+ sums += sum_stride - sum_width - 8;
+ square_sums += sum_stride - sum_width - 8;
+ } while (--y != 0);
+}
+
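+// CalculateMa() evaluates p = max(sum_sq * n - sum * sum, 0) and then
+// (p * scale + (1 << (kSgrProjScaleBits - 1))) >> kSgrProjScaleBits. The
+// multiplication by n is expanded as 9 * a = a + 8 * a and
+// 25 * a = a + 8 * a + 16 * a to avoid _mm_mullo_epi32().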
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m128i dxd = _mm_madd_epi16(sum, sum);
+ // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
+ __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+ const __m128i sub = _mm_sub_epi32(axn, dxd);
+ const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+ const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
+ const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
+ const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+ const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+ return _mm_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m256i dxd = _mm256_madd_epi16(sum, sum);
+ // _mm256_mullo_epi32() has high latency. Using shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm256_mullo_epi32(sum_sq, _mm256_set1_epi32(n));
+ __m256i axn = _mm256_add_epi32(sum_sq, _mm256_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm256_add_epi32(axn, _mm256_slli_epi32(sum_sq, 4));
+ const __m256i sub = _mm256_sub_epi32(axn, dxd);
+ const __m256i p = _mm256_max_epi32(sub, _mm256_setzero_si256());
+ const __m256i pxs = _mm256_mullo_epi32(p, _mm256_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m256i sum_lo = _mm256_unpacklo_epi16(sum, _mm256_setzero_si256());
+ const __m256i sum_hi = _mm256_unpackhi_epi16(sum, _mm256_setzero_si256());
+ const __m256i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+ const __m256i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+ return _mm256_packus_epi32(z0, z1);
+}
+
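+// CalculateB() computes b = (ma * sum * one_over_n + rounding) >>
+// kSgrProjReciprocalBits, where one_over_n is the fixed-point reciprocal of
+// the box size: ((1 << 12) + 12) / 25 = 164 for Pass 1 and
+// ((1 << 12) + 4) / 9 = 455 for Pass 2.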
+template <int n>
+inline __m128i CalculateB(const __m128i sum, const __m128i ma) {
+ static_assert(n == 9 || n == 25, "");
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+ const __m128i m0 = VmullLo16(ma, sum);
+ const __m128i m1 = VmullHi16(ma, sum);
+ const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+ const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+ const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+ const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+ return _mm_packus_epi32(b_lo, b_hi);
+}
+
+template <int n>
+inline __m256i CalculateB(const __m256i sum, const __m256i ma) {
+ static_assert(n == 9 || n == 25, "");
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+ const __m256i m0 = VmullLo16(ma, sum);
+ const __m256i m1 = VmullHi16(ma, sum);
+ const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n));
+ const __m256i m3 = _mm256_mullo_epi32(m1, _mm256_set1_epi32(one_over_n));
+ const __m256i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+ const __m256i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+ return _mm256_packus_epi32(b_lo, b_hi);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const __m256i s5[5], const __m256i sq5[5][2],
+ const uint32_t scale, __m256i* const sum,
+ __m256i* const index) {
+ __m256i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m256i s3[3], const __m256i sq3[3][2],
+ const uint32_t scale, __m256i* const sum,
+ __m256i* const index) {
+ __m256i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+ __m128i* const ma, __m128i* const b) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i idx = _mm_packus_epi16(index, index);
+ // The values are not actually stored and reloaded; the compiler keeps |temp|
+ // in a 64-bit general-purpose register, which is faster than using
+ // _mm_extract_epi8().
+ uint8_t temp[8];
+ StoreLo8(temp, idx);
+ *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], 1);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], 2);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], 3);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], 4);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], 5);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], 6);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], 7);
+ // b = ma * sum * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ *b = CalculateB<n>(sum, maq);
+}
+
+// The first 48 elements of kSgrMaLookup, with each 16-element group repeated
+// twice (a period of 16).
+alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = {
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
+ 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5,
+ 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5};
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// so that the shuffle result is 0. The most significant bit 1 comes either
+// from the comparison instruction or from the sign bit of the index.
+inline __m256i ShuffleIndex(const __m256i table, const __m256i index) {
+ __m256i mask;
+ mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15));
+ mask = _mm256_or_si256(mask, index);
+ return _mm256_shuffle_epi8(table, mask);
+}
+
+inline __m256i AdjustValue(const __m256i value, const __m256i index,
+ const int threshold) {
+ const __m256i thresholds = _mm256_set1_epi8(threshold - 128);
+ const __m256i offset = _mm256_cmpgt_epi8(index, thresholds);
+ return _mm256_add_epi8(value, offset);
+}
+
+template <int n>
+inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2],
+ __m256i ma[3], __m256i b[2]) {
+ static_assert(n == 9 || n == 25, "");
+ // Use table lookups to read elements whose indices are less than 48.
+ const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32);
+ const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32);
+ const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32);
+ const __m256i indices = _mm256_packus_epi16(index[0], index[1]);
+ __m256i idx, mas;
+ // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127));
+ // Each shuffle returns 0 for indices outside its 16-entry sub-range, so the
+ // partial results can simply be OR'd together; elements whose indices are 48
+ // or larger stay 0 after all three lookups.
+ // Get shuffle results for indices in range [0, 15].
+ mas = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+ const __m256i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ mas = _mm256_or_si256(mas, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+ const __m256i res2 = ShuffleIndex(c2, idx);
+ mas = _mm256_or_si256(mas, res2);
+
+ // For elements whose indices are larger than 47, the lookup values change
+ // only rarely as the index increases, so they are derived with comparison
+ // and addition operations instead.
+ // Add -128 to apply signed comparison instructions.
+ idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128));
+ // Elements whose indices are larger than 47 (currently 0) are set to 5.
+ mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5));
+ mas = AdjustValue(mas, idx, 55); // 55 is the last index whose value is 5.
+ mas = AdjustValue(mas, idx, 72); // 72 is the last index whose value is 4.
+ mas = AdjustValue(mas, idx, 101); // 101 is the last index whose value is 3.
+ mas = AdjustValue(mas, idx, 169); // 169 is the last index whose value is 2.
+ mas = AdjustValue(mas, idx, 254); // 254 is the last index whose value is 1.
+
+ ma[2] = _mm256_permute4x64_epi64(mas, 0x93); // 32-39 8-15 16-23 24-31
+ ma[0] = _mm256_blend_epi32(ma[0], ma[2], 0xfc); // 0-7 8-15 16-23 24-31
+ ma[1] = _mm256_permute2x128_si256(ma[0], ma[2], 0x21);
+
+ // b = ma * sum * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256());
+ const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256());
+ b[0] = CalculateB<n>(sum[0], maq0);
+ b[1] = CalculateB<n>(sum[1], maq1);
+}
+
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ __m128i sum, index;
+ CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+ LookupIntermediate<25>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ __m128i sum, index;
+ CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+ LookupIntermediate<9>(sum, index, ma, b);
+}
+
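+// Store343_444() derives both weighted sums from one 3-tap sum of b3:
+// sum_b444 = 4 * (b0 + b1 + b2) and
+// sum_b343 = sum_b444 - (b0 + b1 + b2) + b1 = 3 * (b0 + b1 + b2) + b1.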
+inline void Store343_444(const __m256i b3[2], const ptrdiff_t x,
+ __m256i sum_b343[2], __m256i sum_b444[2],
+ uint32_t* const b343, uint32_t* const b444) {
+ __m256i b[3], sum_b111[2];
+ Prepare3_16(b3, b);
+ sum_b111[0] = Sum3WLo32(b);
+ sum_b111[1] = Sum3WHi32(b);
+ sum_b444[0] = _mm256_slli_epi32(sum_b111[0], 2);
+ sum_b444[1] = _mm256_slli_epi32(sum_b111[1], 2);
+ StoreAligned64(b444 + x, sum_b444);
+ sum_b343[0] = _mm256_sub_epi32(sum_b444[0], sum_b111[0]);
+ sum_b343[1] = _mm256_sub_epi32(sum_b444[1], sum_b111[1]);
+ sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
+ sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
+ StoreAligned64(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i* const sum_ma444, __m256i sum_b343[2],
+ __m256i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m256i sum_ma111 = Sum3WLo16(ma3);
+ *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+ StoreAligned32(ma444 + x, *sum_ma444);
+ const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+ StoreAligned32(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i* const sum_ma444, __m256i sum_b343[2],
+ __m256i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m256i sum_ma111 = Sum3WHi16(ma3);
+ *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+ StoreAligned32(ma444 + x, *sum_ma444);
+ const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+ StoreAligned32(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444, sum_b444[2];
+ Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444, sum_b444[2];
+ Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma343, sum_b343[2];
+ Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma343, sum_b343[2];
+ Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
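+// The BoxFilterPreProcess5* functions below handle the radius-2 (5x5, Pass 1)
+// path: they square and sum the new rows, store the running sums, and then
+// derive the intermediate |ma| and |b| values for the current column block.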
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const __m128i s[2][3], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
+ s5[0][3] = Sum5Horizontal(s[0][0]);
+ StoreAligned16(sum5[3], s5[0][3]);
+ s5[0][4] = Sum5Horizontal(s[1][0]);
+ StoreAligned16(sum5[4], s5[0][4]);
+ Sum5WHorizontal(sq[0], sq5[3]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ Sum5WHorizontal(sq[1], sq5[4]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5[0]);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const uint8_t* const src0, const uint8_t* const src1,
+ const ptrdiff_t over_read_in_bytes, const ptrdiff_t sum_width,
+ const ptrdiff_t x, const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m256i sq[2][3], __m256i ma[3],
+ __m256i b[3]) {
+ const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8);
+ const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8);
+ __m256i s5[2][5], sq5[5][2], sum[2], index[2];
+ sq[0][1] = SquareLo8(s0);
+ sq[0][2] = SquareHi8(s0);
+ sq[1][1] = SquareLo8(s1);
+ sq[1][2] = SquareHi8(s1);
+ sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+ sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+ Sum5Horizontal(src0, over_read_in_bytes, &s5[0][3], &s5[1][3]);
+ Sum5Horizontal(src1, over_read_in_bytes, &s5[0][4], &s5[1][4]);
+ StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+ StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+ Sum5WHorizontal(sq[0], sq5[3]);
+ StoreAligned64(square_sum5[3] + x, sq5[3]);
+ Sum5WHorizontal(sq[1], sq5[4]);
+ StoreAligned64(square_sum5[4] + x, sq5[4]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+ Sum5WHorizontal(sq[0] + 1, sq5[3]);
+ StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+ Sum5WHorizontal(sq[1] + 1, sq5[4]);
+ StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+ CalculateIntermediate<25>(sum, index, ma, b + 1);
+ b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const __m128i s, const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[5], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ s5[3] = s5[4] = Sum5Horizontal(s);
+ Sum5WHorizontal(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t sum_width, const ptrdiff_t x, const uint32_t scale,
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ __m256i sq[3], __m256i ma[3], __m256i b[3]) {
+ const __m256i s = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+ __m256i s5[2][5], sq5[5][2], sum[2], index[2];
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ Sum5Horizontal(src, over_read_in_bytes, &s5[0][3], &s5[1][3]);
+ s5[0][4] = s5[0][3];
+ s5[1][4] = s5[1][3];
+ Sum5WHorizontal(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+ Sum5WHorizontal(sq + 1, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+ CalculateIntermediate<25>(sum, index, ma, b + 1);
+ b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const __m128i s, const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s3[3], sq3[3][2];
+ sq[1] = SquareHi8(s);
+ s3[2] = Sum3Horizontal(s);
+ StoreAligned16(sum3[2], s3[2]);
+ Sum3WHorizontal(sq, sq3[2]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t x, const ptrdiff_t sum_width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], __m256i sq[3],
+ __m256i ma[3], __m256i b[3]) {
+ const __m256i s = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+ __m256i s3[4], sq3[3][2], sum[2], index[2];
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ Sum3Horizontal(src, over_read_in_bytes, s3 + 2);
+ StoreAligned64(sum3[2] + x, s3 + 2);
+ Sum3WHorizontal(sq + 0, sq3[2]);
+ StoreAligned64(square_sum3[2] + x, sq3[2]);
+ LoadAligned32x2U16(sum3, x, s3);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+ Sum3WHorizontal(sq + 1, sq3[2]);
+ StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3 + 1);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+ CalculateIntermediate<9>(sum, index, ma, b + 1);
+ b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const __m128i s[2], const uint16_t scales[2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], __m128i sq[2][2], __m128i ma3[2],
+ __m128i b3[2], __m128i* const ma5, __m128i* const b5) {
+ __m128i s3[4], s5[5], sq3[4][2], sq5[5][2];
+ sq[0][1] = SquareHi8(s[0]);
+ sq[1][1] = SquareHi8(s[1]);
+ SumHorizontalLo(s[0], &s3[2], &s5[3]);
+ SumHorizontalLo(s[1], &s3[3], &s5[4]);
+ StoreAligned16(sum3[2], s3[2]);
+ StoreAligned16(sum3[3], s3[3]);
+ StoreAligned16(sum5[3], s5[3]);
+ StoreAligned16(sum5[4], s5[4]);
+ SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ // Note: the SSE4_1 version calls CalculateIntermediate() instead of the slow
+ // LookupIntermediate() when 16 intermediate values are calculated at once.
+ // For AVX2, however, the compiler generates even slower code for that
+ // approach, so CalculateIntermediate3() is kept here.
+ CalculateIntermediate3(s3 + 0, sq3 + 0, scales[1], &ma3[0], &b3[0]);
+ CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], &ma3[1], &b3[1]);
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1,
+ const ptrdiff_t over_read_in_bytes, const ptrdiff_t x,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, __m256i sq[2][3], __m256i ma3[2][3],
+ __m256i b3[2][5], __m256i ma5[3], __m256i b5[5]) {
+ const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8);
+ const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8);
+ __m256i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sq3t[4][2], sq5t[5][2],
+ sum_3[2][2], index_3[2][2], sum_5[2], index_5[2];
+ sq[0][1] = SquareLo8(s0);
+ sq[0][2] = SquareHi8(s0);
+ sq[1][1] = SquareLo8(s1);
+ sq[1][2] = SquareHi8(s1);
+ sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+ sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+ SumHorizontal(src0, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+ &s5[1][3]);
+ SumHorizontal(src1, over_read_in_bytes, &s3[0][3], &s3[1][3], &s5[0][4],
+ &s5[1][4]);
+ StoreAligned32(sum3[2] + x + 0, s3[0][2]);
+ StoreAligned32(sum3[2] + x + 16, s3[1][2]);
+ StoreAligned32(sum3[3] + x + 0, s3[0][3]);
+ StoreAligned32(sum3[3] + x + 16, s3[1][3]);
+ StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+ StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+ SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned64(square_sum3[2] + x, sq3[2]);
+ StoreAligned64(square_sum5[3] + x, sq5[3]);
+ StoreAligned64(square_sum3[3] + x, sq3[3]);
+ StoreAligned64(square_sum5[4] + x, sq5[4]);
+ LoadAligned32x2U16(sum3, x, s3[0]);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0][0], &index_3[0][0]);
+ CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum_3[1][0],
+ &index_3[1][0]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+ SumHorizontal(sq[0] + 1, &sq3t[2][0], &sq3t[2][1], &sq5t[3][0], &sq5t[3][1]);
+ SumHorizontal(sq[1] + 1, &sq3t[3][0], &sq3t[3][1], &sq5t[4][0], &sq5t[4][1]);
+ StoreAligned64(square_sum3[2] + x + 16, sq3t[2]);
+ StoreAligned64(square_sum5[3] + x + 16, sq5t[3]);
+ StoreAligned64(square_sum3[3] + x + 16, sq3t[3]);
+ StoreAligned64(square_sum5[4] + x + 16, sq5t[4]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3t);
+ CalculateSumAndIndex3(s3[1], sq3t, scales[1], &sum_3[0][1], &index_3[0][1]);
+ CalculateSumAndIndex3(s3[1] + 1, sq3t + 1, scales[1], &sum_3[1][1],
+ &index_3[1][1]);
+ CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], b3[0] + 1);
+ CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], b3[1] + 1);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5t);
+ CalculateSumAndIndex5(s5[1], sq5t, scales[0], &sum_5[1], &index_5[1]);
+ CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1);
+ b3[0][0] = _mm256_permute2x128_si256(b3[0][0], b3[0][2], 0x21);
+ b3[1][0] = _mm256_permute2x128_si256(b3[1][0], b3[1][2], 0x21);
+ b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3,
+ __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
+ __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ SumHorizontalLo(s, &s3[2], &s5[3]);
+ SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ s5[4] = s5[3];
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t sum_width, const ptrdiff_t x, const uint16_t scales[2],
+ const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+ __m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5],
+ __m256i b5[5]) {
+ const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+ __m256i s3[2][3], s5[2][5], sq3[4][2], sq3t[4][2], sq5[5][2], sq5t[5][2],
+ sum_3[2], index_3[2], sum_5[2], index_5[2];
+ sq[1] = SquareLo8(s0);
+ sq[2] = SquareHi8(s0);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ SumHorizontal(src, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+ &s5[1][3]);
+ SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned32x2U16(sum3, x, s3[0]);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0], &index_3[0]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+ SumHorizontal(sq + 1, &sq3t[2][0], &sq3t[2][1], &sq5t[3][0], &sq5t[3][1]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3t);
+ CalculateSumAndIndex3(s3[1], sq3t, scales[1], &sum_3[1], &index_3[1]);
+ CalculateIntermediate<9>(sum_3, index_3, ma3, b3 + 1);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5t);
+ sq5t[4][0] = sq5t[3][0];
+ sq5t[4][1] = sq5t[3][1];
+ CalculateSumAndIndex5(s5[1], sq5t, scales[0], &sum_5[1], &index_5[1]);
+ CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1);
+ b3[0] = _mm256_permute2x128_si256(b3[0], b3[2], 0x21);
+ b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+ const uint8_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* ma565,
+ uint32_t* b565) {
+ __m128i ma0, b0, s[2][3], sq_128[2][2];
+ __m256i mas[3], sq[2][3], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0][0]);
+ sq_128[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, &b0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma5[3], ma[2], b[4];
+ BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[0] = Sum565Lo(ma5);
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned64(ma565, ma);
+ Sum565W(bs + 0, b + 0);
+ Sum565W(bs + 1, b + 2);
+ StoreAligned64(b565, b + 0);
+ StoreAligned64(b565 + 16, b + 2);
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint8_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+ uint32_t* b444) {
+ __m128i ma0, sq_128[2], b0;
+ __m256i mas[3], sq[3], bs[3];
+ const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytesPass2_128 - width);
+ sq_128[0] = SquareLo8(s);
+ BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, &b0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma3[3];
+ BoxFilterPreProcess3(src + x + 8, x + 8 + kOverreadInBytesPass2_256 - width,
+ x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ Prepare3_8(mas, ma3);
+ if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
+ Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444Hi(ma3, bs + 1, 16, ma343, ma444, b343, b444);
+ ma444 += 32;
+ b444 += 32;
+ } else {
+ __m256i ma[2], b[4];
+ ma[0] = Sum343Lo(ma3);
+ ma[1] = Sum343Hi(ma3);
+ StoreAligned64(ma343, ma);
+ Sum343W(bs + 0, b + 0);
+ Sum343W(bs + 1, b + 2);
+ StoreAligned64(b343 + 0, b + 0);
+ StoreAligned64(b343 + 16, b + 2);
+ }
+ sq[0] = sq[2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ ma343 += 32;
+ b343 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[2], uint16_t* ma565, uint32_t* const b343[4],
+ uint32_t* const b444[2], uint32_t* b565) {
+ __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0;
+ __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0]);
+ sq_128[1][0] = SquareLo8(s[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+ ma3_128, b3_128, &ma5_0, &b5_0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ ma3[0][0] = SetrM128i(ma3_128[0], ma3_128[0]);
+ ma3[1][0] = SetrM128i(ma3_128[1], ma3_128[1]);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0][0] = SetrM128i(b3_128[0], b3_128[0]);
+ b3[1][0] = SetrM128i(b3_128[1], b3_128[1]);
+ b5[0] = SetrM128i(b5_0, b5_0);
+
+ int x = 0;
+ do {
+ __m256i ma[2], b[4], ma3x[3], ma5x[3];
+ BoxFilterPreProcess(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, x + 8,
+ scales, sum3, sum5, square_sum3, square_sum5, sum_width,
+ sq, ma3, b3, ma5, b5);
+ Prepare3_8(ma3[0], ma3x);
+ ma[0] = Sum343Lo(ma3x);
+ ma[1] = Sum343Hi(ma3x);
+ StoreAligned64(ma343[0] + x, ma);
+ Sum343W(b3[0], b);
+ StoreAligned64(b343[0] + x, b);
+ Sum565W(b5, b);
+ StoreAligned64(b565, b);
+ Prepare3_8(ma3[1], ma3x);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
+ Store343_444Hi(ma3x, b3[1] + 1, x + 16, ma343[1], ma444[0], b343[1],
+ b444[0]);
+ Prepare3_8(ma5, ma5x);
+ ma[0] = Sum565Lo(ma5x);
+ ma[1] = Sum565Hi(ma5x);
+ StoreAligned64(ma565, ma);
+ Sum343W(b3[0] + 1, b);
+ StoreAligned64(b343[0] + x + 16, b);
+ Sum565W(b5 + 1, b);
+ StoreAligned64(b565 + 16, b);
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ ma3[0][0] = ma3[0][2];
+ ma3[1][0] = ma3[1][2];
+ ma5[0] = ma5[2];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+template <int shift>
+inline __m256i FilterOutput(const __m256i ma_x_src, const __m256i b) {
+ // ma: 255 * 32 = 8160 (13 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ // v: b - ma * 255 (22 bits)
+ const __m256i v = _mm256_sub_epi32(b, ma_x_src);
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9 (13 bits)
+ return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
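+
+// Editorial sketch (not part of the upstream implementation): the scalar
+// equivalent of FilterOutput() above, using the kSgrProjSgrBits == 8 and
+// kSgrProjRestoreBits == 4 values noted in the comments.
+#if 0
+int FilterOutputScalar(const int ma_x_src, const int b, const int shift) {
+  const int v = b - ma_x_src;             // up to 22 bits
+  const int total_shift = 8 + shift - 4;  // 8 or 9
+  // Rounding right shift back down to 13 bits, mirroring VrshrS32().
+  return (v + (1 << (total_shift - 1))) >> total_shift;
+}
+#endif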
+
+template <int shift>
+inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma,
+ const __m256i b[2]) {
+ const __m256i ma_x_src_lo = VmullLo16(ma, src);
+ const __m256i ma_x_src_hi = VmullHi16(ma, src);
+ const __m256i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+ const __m256i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+ return _mm256_packs_epi32(dst_lo, dst_hi); // 13 bits
+}
+
+inline __m256i CalculateFilteredOutputPass1(const __m256i src, __m256i ma[2],
+ __m256i b[2][2]) {
+ const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]);
+ __m256i b_sum[2];
+ b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]);
+ b_sum[1] = _mm256_add_epi32(b[0][1], b[1][1]);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i CalculateFilteredOutputPass2(const __m256i src, __m256i ma[3],
+ __m256i b[3][2]) {
+ const __m256i ma_sum = Sum3_16(ma);
+ __m256i b_sum[2];
+ Sum3_32(b, b_sum);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
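+// Editorial note: with kSgrProjRestoreBits == 4 and kSgrProjPrecisionBits == 7
+// this amounts to dst = src + RightShiftWithRounding(v, 11) per pixel; the
+// final clamp to [0, 255] is done by the saturating pack in the callers.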
+inline __m256i SelfGuidedFinal(const __m256i src, const __m256i v[2]) {
+ const __m256i v_lo =
+ VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m256i v_hi =
+ VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m256i vv = _mm256_packs_epi32(v_lo, v_hi);
+ return _mm256_add_epi16(src, vv);
+}
+
+inline __m256i SelfGuidedDoubleMultiplier(const __m256i src,
+ const __m256i filter[2], const int w0,
+ const int w2) {
+ __m256i v[2];
+ const __m256i w0_w2 =
+ _mm256_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
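+ // Each 32-bit lane of |w0_w2| holds the pair (w0, w2) as signed 16-bit
+ // values, so the _mm256_madd_epi16() calls below compute
+ // w0 * filter[0] + w2 * filter[1] for every pixel.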
+ const __m256i f_lo = _mm256_unpacklo_epi16(filter[0], filter[1]);
+ const __m256i f_hi = _mm256_unpackhi_epi16(filter[0], filter[1]);
+ v[0] = _mm256_madd_epi16(w0_w2, f_lo);
+ v[1] = _mm256_madd_epi16(w0_w2, f_hi);
+ return SelfGuidedFinal(src, v);
+}
+
+inline __m256i SelfGuidedSingleMultiplier(const __m256i src,
+ const __m256i filter, const int w0) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ __m256i v[2];
+ v[0] = VmullNLo8(filter, w0);
+ v[1] = VmullNHi8(filter, w0);
+ return SelfGuidedFinal(src, v);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+ const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i ma0, b0, s[2][3], sq_128[2][2];
+ __m256i mas[3], sq[2][3], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0][0]);
+ sq_128[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, &b0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma[3], ma3[3], b[2][2][2];
+ BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma3);
+ ma[1] = Sum565Lo(ma3);
+ ma[2] = Sum565Hi(ma3);
+ StoreAligned64(ma565[1] + x, ma + 1);
+ Sum565W(bs + 0, b[0][1]);
+ Sum565W(bs + 1, b[1][1]);
+ StoreAligned64(b565[1] + x + 0, b[0][1]);
+ StoreAligned64(b565[1] + x + 16, b[1][1]);
+ const __m256i sr0 = LoadUnaligned32(src + x);
+ const __m256i sr1 = LoadUnaligned32(src + stride + x);
+ const __m256i sr0_lo = _mm256_unpacklo_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_lo = _mm256_unpacklo_epi8(sr1, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0][0]);
+ const __m256i p00 = CalculateFilteredOutputPass1(sr0_lo, ma, b[0]);
+ const __m256i p01 = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[0][1]);
+ const __m256i d00 = SelfGuidedSingleMultiplier(sr0_lo, p00, w0);
+ const __m256i d10 = SelfGuidedSingleMultiplier(sr1_lo, p01, w0);
+ const __m256i sr0_hi = _mm256_unpackhi_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_hi = _mm256_unpackhi_epi8(sr1, _mm256_setzero_si256());
+ ma[1] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, b[1][0]);
+ const __m256i p10 = CalculateFilteredOutputPass1(sr0_hi, ma + 1, b[1]);
+ const __m256i p11 = CalculateFilteredOutput<4>(sr1_hi, ma[2], b[1][1]);
+ const __m256i d01 = SelfGuidedSingleMultiplier(sr0_hi, p10, w0);
+ const __m256i d11 = SelfGuidedSingleMultiplier(sr1_hi, p11, w0);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d00, d01));
+ StoreUnaligned32(dst + stride + x, _mm256_packus_epi16(d10, d11));
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+ uint32_t* b565, uint8_t* const dst) {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ __m128i ma0, b0, sq_128[2];
+ __m256i mas[3], sq[3], bs[3];
+ sq_128[0] = SquareLo8(s0);
+ BoxFilterPreProcess5LastRowLo(s0, scale, sum5, square_sum5, sq_128, &ma0,
+ &b0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma[3], ma5[3], b[2][2];
+ BoxFilterPreProcess5LastRow(
+ src0 + x + 8, x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ ma[2] = Sum565Hi(ma5);
+ Sum565W(bs + 0, b[1]);
+ const __m256i sr = LoadUnaligned32(src + x);
+ const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+ const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma565);
+ LoadAligned64(b565 + 0, b[0]);
+ const __m256i p0 = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[1] = LoadAligned32(ma565 + 16);
+ LoadAligned64(b565 + 16, b[0]);
+ Sum565W(bs + 1, b[1]);
+ const __m256i p1 = CalculateFilteredOutputPass1(sr_hi, ma + 1, b);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ sq[0] = sq[2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+ uint32_t* const b444[2], uint8_t* const dst) {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src0, kOverreadInBytesPass2_128 - width);
+ __m128i ma0, b0, sq_128[2];
+ __m256i mas[3], sq[3], bs[3];
+ sq_128[0] = SquareLo8(s0);
+ BoxFilterPreProcess3Lo(s0, scale, sum3, square_sum3, sq_128, &ma0, &b0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma[4], b[4][2], ma3[3];
+ BoxFilterPreProcess3(src0 + x + 8,
+ x + 8 + kOverreadInBytesPass2_256 - width, x + 8,
+ sum_width, scale, sum3, square_sum3, sq, mas, bs);
+ Prepare3_8(mas, ma3);
+ Store343_444Lo(ma3, bs + 0, x + 0, &ma[2], b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ Store343_444Hi(ma3, bs + 1, x + 16, &ma[3], b[3], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const __m256i sr = LoadUnaligned32(src + x);
+ const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+ const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma343[0] + x);
+ ma[1] = LoadAligned32(ma444[0] + x);
+ LoadAligned64(b343[0] + x, b[0]);
+ LoadAligned64(b444[0] + x, b[1]);
+ const __m256i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ ma[1] = LoadAligned32(ma343[0] + x + 16);
+ ma[2] = LoadAligned32(ma444[0] + x + 16);
+ LoadAligned64(b343[0] + x + 16, b[1]);
+ LoadAligned64(b444[0] + x + 16, b[2]);
+ const __m256i p1 = CalculateFilteredOutputPass2(sr_hi, ma + 1, b + 1);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ sq[0] = sq[2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ x += 32;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0;
+ __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0]);
+ sq_128[1][0] = SquareLo8(s[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+ ma3_128, b3_128, &ma5_0, &b5_0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ ma3[0][0] = SetrM128i(ma3_128[0], ma3_128[0]);
+ ma3[1][0] = SetrM128i(ma3_128[1], ma3_128[1]);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0][0] = SetrM128i(b3_128[0], b3_128[0]);
+ b3[1][0] = SetrM128i(b3_128[1], b3_128[1]);
+ b5[0] = SetrM128i(b5_0, b5_0);
+
+ int x = 0;
+ do {
+ __m256i ma[3][3], mat[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+ BoxFilterPreProcess(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, x + 8,
+ scales, sum3, sum5, square_sum3, square_sum5, sum_width,
+ sq, ma3, b3, ma5, b5);
+ Prepare3_8(ma3[0], ma3x[0]);
+ Prepare3_8(ma3[1], ma3x[1]);
+ Prepare3_8(ma5, ma5x);
+ Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ ma[0][1] = Sum565Lo(ma5x);
+ ma[0][2] = Sum565Hi(ma5x);
+ mat[0][1] = ma[0][2];
+ StoreAligned64(ma565[1] + x, ma[0] + 1);
+ Sum565W(b5, b[0][1]);
+ StoreAligned64(b565[1] + x, b[0][1]);
+ const __m256i sr0 = LoadUnaligned32(src + x);
+ const __m256i sr1 = LoadUnaligned32(src + stride + x);
+ const __m256i sr0_lo = _mm256_unpacklo_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_lo = _mm256_unpacklo_epi8(sr1, _mm256_setzero_si256());
+ ma[0][0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned32(ma343[0] + x);
+ ma[1][1] = LoadAligned32(ma444[0] + x);
+ LoadAligned64(b343[0] + x, b[1][0]);
+ LoadAligned64(b444[0] + x, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+ const __m256i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+ ma[2][0] = LoadAligned32(ma343[1] + x);
+ LoadAligned64(b343[1] + x, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+ const __m256i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+ Sum565W(b5 + 1, b[0][1]);
+ StoreAligned64(b565[1] + x + 16, b[0][1]);
+ Store343_444Hi(ma3x[0], b3[0] + 1, x + 16, &mat[1][2], &mat[2][1], b[1][2],
+ b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Hi(ma3x[1], b3[1] + 1, x + 16, &mat[2][2], b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ const __m256i sr0_hi = _mm256_unpackhi_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_hi = _mm256_unpackhi_epi8(sr1, _mm256_setzero_si256());
+ mat[0][0] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_hi, mat[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_hi, mat[0][1], b[0][1]);
+ mat[1][0] = LoadAligned32(ma343[0] + x + 16);
+ mat[1][1] = LoadAligned32(ma444[0] + x + 16);
+ LoadAligned64(b343[0] + x + 16, b[1][0]);
+ LoadAligned64(b444[0] + x + 16, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_hi, mat[1], b[1]);
+ const __m256i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+ mat[2][0] = LoadAligned32(ma343[1] + x + 16);
+ LoadAligned64(b343[1] + x + 16, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_hi, mat[2], b[2]);
+ const __m256i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d00, d01));
+ StoreUnaligned32(dst + stride + x, _mm256_packus_epi16(d10, d11));
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ ma3[0][0] = ma3[0][2];
+ ma3[1][0] = ma3[1][2];
+ ma5[0] = ma5[2];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ __m128i ma3_0, ma5_0, b3_0, b5_0, sq_128[2];
+ __m256i ma3[3], ma5[3], sq[3], b3[3], b5[3];
+ sq_128[0] = SquareLo8(s0);
+ BoxFilterPreProcessLastRowLo(s0, scales, sum3, sum5, square_sum3, square_sum5,
+ sq_128, &ma3_0, &ma5_0, &b3_0, &b5_0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ ma3[0] = SetrM128i(ma3_0, ma3_0);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0] = SetrM128i(b3_0, b3_0);
+ b5[0] = SetrM128i(b5_0, b5_0);
+
+ int x = 0;
+ do {
+ __m256i ma[3], mat[3], b[3][2], p[2], ma3x[3], ma5x[3];
+ BoxFilterPreProcessLastRow(src0 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width,
+ sum_width, x + 8, scales, sum3, sum5,
+ square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+ Prepare3_8(ma3, ma3x);
+ Prepare3_8(ma5, ma5x);
+ ma[1] = Sum565Lo(ma5x);
+ Sum565W(b5, b[1]);
+ ma[2] = Sum343Lo(ma3x);
+ Sum343W(b3, b[2]);
+ const __m256i sr = LoadUnaligned32(src + x);
+ const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[0] = LoadAligned32(ma343[0] + x);
+ ma[1] = LoadAligned32(ma444[0] + x);
+ LoadAligned64(b343[0] + x, b[0]);
+ LoadAligned64(b444[0] + x, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+ mat[1] = Sum565Hi(ma5x);
+ Sum565W(b5 + 1, b[1]);
+ mat[2] = Sum343Hi(ma3x);
+ Sum343W(b3 + 1, b[2]);
+ const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+ mat[0] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_hi, mat, b);
+ mat[0] = LoadAligned32(ma343[0] + x + 16);
+ mat[1] = LoadAligned32(ma444[0] + x + 16);
+ LoadAligned64(b343[0] + x + 16, b[0]);
+ LoadAligned64(b444[0] + x + 16, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_hi, mat, b);
+ const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ sq[0] = sq[2];
+ ma3[0] = ma3[2];
+ ma5[0] = ma5[2];
+ b3[0] = b3[2];
+ b5[0] = b5[2];
+ x += 32;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint8_t* src,
+ const uint8_t* const top_border, const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3 + kSumOffset;
+ square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, stride, width, sum_stride, temp_stride, sum3[0], sum5[1],
+ square_sum3[0], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, sum_width, ma343, ma444, ma565[0], b343,
+ b444, b565[0]);
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+ b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + stride, width, sum_width, scales,
+ w0, w2, sum3, sum5, square_sum3, square_sum5, ma343, ma444,
+ ma565, b343, b444, b565, dst);
+ }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src,
+ const uint8_t* const top_border,
+ const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, stride, width, sum_stride, temp_stride, sum5[1],
+ square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ sum_width, scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src, bottom_border + stride, width, sum_width, scale,
+ w0, sum5, square_sum5, ma565[0], b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src,
+ const uint8_t* const top_border,
+ const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ uint8_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3 + kSumOffset;
+ square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, stride, width, sum_stride, temp_stride, sum3[0],
+ square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+ sum_width, ma343[0], nullptr, b343[0],
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint8_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+ ma343[1], ma444[0], b343[1], b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ int y = std::min(height, 2);
+ src += 2;
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+// If |width| is not a multiple of 32, up to 31 more pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it will
+// not be part of the visible frame.
+void SelfGuidedFilter_AVX2(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const void* const top_border, const void* const bottom_border,
+ const ptrdiff_t stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* top = static_cast<const uint8_t*>(top_border);
+ const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+ auto* const dst = static_cast<uint8_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3,
+ stride, width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2,
+ stride, width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride,
+ width, height, sgr_buffer, dst);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_AVX2(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_AVX2;
+#endif
+#if DSP_ENABLED_8BPP_AVX2(SelfGuidedFilter)
+ dsp->loop_restorations[1] = SelfGuidedFilter_AVX2;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void LoopRestorationInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit_AVX2() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_AVX2
diff --git a/src/dsp/x86/loop_restoration_avx2.h b/src/dsp/x86/loop_restoration_avx2.h
new file mode 100644
index 0000000..d80227c
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_avx2.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_restorations; see the defines below for specifics.
+// These functions are not thread-safe.
+void LoopRestorationInit_AVX2();
+void LoopRestorationInit10bpp_AVX2();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If avx2 is enabled and the baseline define hasn't already been set by a
+// higher level of optimization, signal that the avx2 implementation should be
+// used.
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2
+#endif
+
+#endif // LIBGAV1_TARGETING_AVX2
+
+#endif // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc
new file mode 100644
index 0000000..24f5ad2
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_sse4.cc
@@ -0,0 +1,2549 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+inline void WienerHorizontalClip(const __m128i s[2], const __m128i s_3x128,
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit =
+ (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
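+ // For 8 bpp, kWienerFilterBits == 7 and kInterRoundBitsHorizontal == 3, so
+ // offset == 2048 and limit == 8191: the clipped range [-2048, 6143] spans
+ // 8192 values, i.e. 13 bits.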
+ const __m128i offsets = _mm_set1_epi16(-offset);
+ const __m128i limits = _mm_set1_epi16(limit - offset);
+ // The sum range here is [-128 * 255 + 4, 90 * 255 + 4].
+ const __m128i sum = _mm_add_epi16(s[0], s[1]);
+ const __m128i rounded_sum0 = _mm_srai_epi16(sum, kInterRoundBitsHorizontal);
+ // Add back scaled down offset correction.
+ const __m128i rounded_sum1 = _mm_add_epi16(rounded_sum0, s_3x128);
+ const __m128i d0 = _mm_max_epi16(rounded_sum1, offsets);
+ const __m128i d1 = _mm_min_epi16(d0, limits);
+ StoreAligned16(wiener_buffer, d1);
+}
+
+inline void WienerHorizontalTap7Kernel(const __m128i s[4],
+ const __m128i filter[4],
+ int16_t* const wiener_buffer) {
+ __m128i madds[4];
+ madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+ madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+ madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
+ madds[3] = _mm_maddubs_epi16(s[3], filter[3]);
+ madds[0] = _mm_add_epi16(madds[0], madds[2]);
+ madds[1] = _mm_add_epi16(madds[1], madds[3]);
+ const __m128i s_3x128 =
+ _mm_slli_epi16(_mm_srli_epi16(s[1], 8), 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m128i s[5],
+ const __m128i filter[3],
+ int16_t* const wiener_buffer) {
+ __m128i madds[3];
+ madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+ madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+ madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
+ madds[0] = _mm_add_epi16(madds[0], madds[2]);
+ const __m128i s_3x128 =
+ _mm_srli_epi16(_mm_slli_epi16(s[1], 8), kInterRoundBitsHorizontal + 1);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap3Kernel(const __m128i s[2],
+ const __m128i filter[2],
+ int16_t* const wiener_buffer) {
+ __m128i madds[2];
+ madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+ madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+ const __m128i s_3x128 =
+ _mm_slli_epi16(_mm_srli_epi16(s[0], 8), 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+// Loading all and unpacking is about 7% faster than using _mm_alignr_epi8().
+inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int coefficient0,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
+ __m128i filter[4];
+ filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0200));
+ filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
+ filter[2] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0204));
+ filter[3] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient0));
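+ // |ss[3]| in the loop below interleaves the s[6] pixels with |round|, so
+ // pairing (coefficient0, 1) here lets _mm_maddubs_epi16() compute
+ // s[6] * coefficient0 + round in a single step.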
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[7], ss[4];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ s[5] = LoadUnaligned16(src + x + 5);
+ s[6] = LoadUnaligned16(src + x + 6);
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ ss[3] = _mm_unpacklo_epi8(s[6], round);
+ WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 0);
+ ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
+ ss[2] = _mm_unpackhi_epi8(s[4], s[5]);
+ ss[3] = _mm_unpackhi_epi8(s[6], round);
+ WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 8);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int coefficient1,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
+ __m128i filter[3];
+ filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0402));
+ filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0406));
+ filter[2] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient1));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[5], ss[3];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ ss[2] = _mm_unpacklo_epi8(s[4], round);
+ WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 0);
+ ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
+ ss[2] = _mm_unpackhi_epi8(s[4], round);
+ WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 8);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const int coefficient2,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
+ __m128i filter[2];
+ filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
+ filter[1] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient2));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[3], ss[2];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], round);
+ WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 0);
+ ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ ss[1] = _mm_unpackhi_epi8(s[2], round);
+ WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 8);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m128i s = LoadUnaligned16(src + x);
+ const __m128i s0 = _mm_unpacklo_epi8(s, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi8(s, _mm_setzero_si128());
+ const __m128i d0 = _mm_slli_epi16(s0, 4);
+ const __m128i d1 = _mm_slli_epi16(s1, 4);
+ StoreAligned16(*wiener_buffer + x + 0, d0);
+ StoreAligned16(*wiener_buffer + x + 8, d1);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline __m128i WienerVertical7(const __m128i a[2], const __m128i filter[2]) {
+ const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i sum0 = _mm_add_epi32(round, madd0);
+ const __m128i sum1 = _mm_add_epi32(sum0, madd1);
+ return _mm_srai_epi32(sum1, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical5(const __m128i a[2], const __m128i filter[2]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i sum = _mm_add_epi32(madd0, madd1);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical3(const __m128i a, const __m128i filter) {
+ const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+ const __m128i madd = _mm_madd_epi16(a, filter);
+ const __m128i sum = _mm_add_epi32(round, madd);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVerticalFilter7(const __m128i a[7],
+ const __m128i filter[2]) {
+ __m128i b[2];
+ const __m128i a06 = _mm_add_epi16(a[0], a[6]);
+ const __m128i a15 = _mm_add_epi16(a[1], a[5]);
+ const __m128i a24 = _mm_add_epi16(a[2], a[4]);
+ b[0] = _mm_unpacklo_epi16(a06, a15);
+ b[1] = _mm_unpacklo_epi16(a24, a[3]);
+ const __m128i sum0 = WienerVertical7(b, filter);
+ b[0] = _mm_unpackhi_epi16(a06, a15);
+ b[1] = _mm_unpackhi_epi16(a24, a[3]);
+ const __m128i sum1 = WienerVertical7(b, filter);
+ return _mm_packs_epi32(sum0, sum1);
+}
+
+inline __m128i WienerVerticalFilter5(const __m128i a[5],
+ const __m128i filter[2]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m128i b[2];
+ const __m128i a04 = _mm_add_epi16(a[0], a[4]);
+ const __m128i a13 = _mm_add_epi16(a[1], a[3]);
+ b[0] = _mm_unpacklo_epi16(a04, a13);
+ b[1] = _mm_unpacklo_epi16(a[2], round);
+ const __m128i sum0 = WienerVertical5(b, filter);
+ b[0] = _mm_unpackhi_epi16(a04, a13);
+ b[1] = _mm_unpackhi_epi16(a[2], round);
+ const __m128i sum1 = WienerVertical5(b, filter);
+ return _mm_packs_epi32(sum0, sum1);
+}
+
+inline __m128i WienerVerticalFilter3(const __m128i a[3], const __m128i filter) {
+ __m128i b;
+ const __m128i a02 = _mm_add_epi16(a[0], a[2]);
+ b = _mm_unpacklo_epi16(a02, a[1]);
+ const __m128i sum0 = WienerVertical3(b, filter);
+ b = _mm_unpackhi_epi16(a02, a[1]);
+ const __m128i sum1 = WienerVertical3(b, filter);
+ return _mm_packs_epi32(sum0, sum1);
+}
+
+inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i a[7]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i a[5]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter, __m128i a[3]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i d[2]) {
+ __m128i a[8];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[7] = LoadAligned16(wiener_buffer + 7 * wiener_stride);
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i d[2]) {
+ __m128i a[6];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter, __m128i d[2]) {
+ __m128i a[4];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i c = LoadLo8(coefficients);
+ __m128i filter[2];
+ filter[0] = _mm_shuffle_epi32(c, 0x0);
+ filter[1] = _mm_shuffle_epi32(c, 0x55);
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i d[2][2];
+ WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap7Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
+ StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
+ StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[7];
+ const __m128i d0 =
+ WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m128i d1 =
+ WienerVerticalTap7Kernel(wiener_buffer + x + 8, width, filter, a);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i c = Load4(coefficients);
+ __m128i filter[2];
+ filter[0] = _mm_shuffle_epi32(c, 0);
+ filter[1] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[2]));
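+ // Each 32-bit lane of |filter[1]| holds (coefficients[2], 1); the kernel
+ // interleaves the center row with the rounding constant, so
+ // _mm_madd_epi16() adds coefficients[2] * center + round in one step.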
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i d[2][2];
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
+ StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
+ StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[5];
+ const __m128i d0 =
+ WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m128i d1 =
+ WienerVerticalTap5Kernel(wiener_buffer + x + 8, width, filter, a);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i filter =
+ _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i d[2][2];
+ WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap3Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
+ StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
+ StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[3];
+ const __m128i d0 =
+ WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m128i d1 =
+ WienerVerticalTap3Kernel(wiener_buffer + x + 8, width, filter, a);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint8_t* const dst) {
+ const __m128i a0 = LoadAligned16(wiener_buffer + 0);
+ const __m128i a1 = LoadAligned16(wiener_buffer + 8);
+ const __m128i b0 = _mm_add_epi16(a0, _mm_set1_epi16(8));
+ const __m128i b1 = _mm_add_epi16(a1, _mm_set1_epi16(8));
+ const __m128i c0 = _mm_srai_epi16(b0, 4);
+ const __m128i c1 = _mm_srai_epi16(b1, 4);
+ const __m128i d = _mm_packus_epi16(c0, c1);
+ StoreAligned16(dst, d);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info,
+ const void* const source, const void* const top_border,
+ const void* const bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height,
+ RestorationBuffer* const restoration_buffer,
+ void* const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 16);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+ // Horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* const top = static_cast<const uint8_t*>(top_border);
+ const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+ const int16_t* const filter_horizontal =
+ restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
+ const __m128i c = LoadLo8(filter_horizontal);
+ // In order to keep the horizontal pass intermediate values within 16 bits,
+ // we offset |filter[3]| by 128. The 128 offset will be added back in the
+ // loop.
+ const __m128i coefficients_horizontal =
+ _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
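+ // For 8 bpp, kInterRoundBitsHorizontal == 3, so the value added back is
+ // (128 * center_pixel) >> 3 == center_pixel << 4; this is the |s_3x128|
+ // term computed in the horizontal kernels above.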
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
+ wiener_stride, height_extra, filter_horizontal[0],
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ filter_horizontal[0], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ filter_horizontal[0], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
+ wiener_stride, height_extra, filter_horizontal[1],
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ filter_horizontal[1], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ filter_horizontal[1], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
+ wiener_stride, height_extra, filter_horizontal[2],
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ filter_horizontal[2], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ filter_horizontal[2], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
+ wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ }
+
+ // Vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+ // Because the top row of |source| is a duplicate of the second row, and
+ // the bottom row of |source| is a duplicate of the row above it, we can
+ // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// SIMD over-reads 16 - (width % 16) - 2 * padding pixels, where padding is 3
+// for Pass 1 and 2 for Pass 2.
+constexpr int kOverreadInBytesPass1 = 10;
+constexpr int kOverreadInBytesPass2 = 12;
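+// For example, when |width| is a multiple of 16 the over-read is
+// 16 - 0 - 2 * 3 = 10 bytes for Pass 1 and 16 - 0 - 2 * 2 = 12 bytes for
+// Pass 2, matching the constants above.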
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+ dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+ dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+ dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+ dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+ dst[0] = LoadAligned16(src + 0);
+ dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
+ const ptrdiff_t border, __m128i dst[2]) {
+ dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border));
+ dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border));
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[2][2]) {
+ LoadAligned32U32Msan(src[0], x, border, dst[0]);
+ LoadAligned32U32Msan(src[1], x, border, dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+ LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[3][2]) {
+ LoadAligned32U32Msan(src[0], x, border, dst[0]);
+ LoadAligned32U32Msan(src[1], x, border, dst[1]);
+ LoadAligned32U32Msan(src[2], x, border, dst[2]);
+}
+
+inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 4, src[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) {
+ StoreAligned32U32(dst + 0, src + 0);
+ StoreAligned32U32(dst + 8, src + 2);
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers generate very inefficient code for them, which can
+// make the whole decoder about 15% slower.
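+// Instead, widen by unpacking against a zero register
+// (_mm_unpacklo_epi8()/_mm_unpackhi_epi8() and their 16-bit equivalents), as
+// the Vaddl*/Vaddw* helpers below do.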
+
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(s0, s1);
+}
+
+inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(src0, s1);
+}
+
+inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(src0, s1);
+}
+
+inline __m128i VmullNLo8(const __m128i src0, const int src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullNHi8(const __m128i src0, const int src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VrshrS32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srli_epi32(sum, src1);
+}
+
+inline __m128i SquareLo8(const __m128i src) {
+ const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
+}
+
+inline __m128i SquareHi8(const __m128i src) {
+ const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
+}
+
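+// The Prepare*() helpers produce byte- or word-shifted copies of the source so
+// that the horizontal box sums can be formed from adjacent lanes.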
+inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+}
+
+template <int offset>
+inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+ dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+ dst[3] = _mm_srli_si128(src, 3);
+ dst[4] = _mm_srli_si128(src, 4);
+}
+
+template <int offset>
+inline void Prepare5_8(const __m128i src[2], __m128i dst[5]) {
+ dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+ dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], offset + 3);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], offset + 4);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi16(src0, src1);
+ return _mm_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi32(src0, src1);
+ return _mm_add_epi32(sum, src2);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+ const __m128i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m128i Sum3WHi16(const __m128i src[3]) {
+ const __m128i sum = VaddlHi8(src[0], src[1]);
+ return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum3WLo32(const __m128i src[3]) {
+ const __m128i sum = VaddlLo16(src[0], src[1]);
+ return VaddwLo16(sum, src[2]);
+}
+
+inline __m128i Sum3WHi32(const __m128i src[3]) {
+ const __m128i sum = VaddlHi16(src[0], src[1]);
+ return VaddwHi16(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+ const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+ const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+ const __m128i* const src2, const __m128i* const src3,
+ const __m128i* const src4) {
+ const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+ const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+ const __m128i sum = _mm_add_epi32(sum01, sum23);
+ return _mm_add_epi32(sum, *src4);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum5WLo16(const __m128i src[5]) {
+ const __m128i sum01 = VaddlLo8(src[0], src[1]);
+ const __m128i sum23 = VaddlLo8(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return VaddwLo8(sum, src[4]);
+}
+
+inline __m128i Sum5WHi16(const __m128i src[5]) {
+ const __m128i sum01 = VaddlHi8(src[0], src[1]);
+ const __m128i sum23 = VaddlHi8(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return VaddwHi8(sum, src[4]);
+}
+
+inline __m128i Sum3Horizontal(const __m128i src) {
+ __m128i s[3];
+ Prepare3Lo8(src, s);
+ return Sum3WLo16(s);
+}
+
+template <int offset>
+inline void Sum3Horizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_8<offset>(src, s);
+ dst[0] = Sum3WLo16(s);
+ dst[1] = Sum3WHi16(s);
+}
+
+inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum3WLo32(s);
+ dst[1] = Sum3WHi32(s);
+}
+
+inline __m128i Sum5Horizontal(const __m128i src) {
+ __m128i s[5];
+ Prepare5Lo8(src, s);
+ return Sum5WLo16(s);
+}
+
+template <int offset>
+inline void Sum5Horizontal(const __m128i src[2], __m128i* const dst0,
+ __m128i* const dst1) {
+ __m128i s[5];
+ Prepare5_8<offset>(src, s);
+ *dst0 = Sum5WLo16(s);
+ *dst1 = Sum5WHi16(s);
+}
+
+inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ const __m128i sum01_lo = VaddlLo16(s[0], s[1]);
+ const __m128i sum23_lo = VaddlLo16(s[2], s[3]);
+ const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo);
+ dst[0] = VaddwLo16(sum0123_lo, s[4]);
+ const __m128i sum01_hi = VaddlHi16(s[0], s[1]);
+ const __m128i sum23_hi = VaddlHi16(s[2], s[3]);
+ const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi);
+ dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
+void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlLo16(src[0], src[4]);
+ *row_sq3 = Sum3WLo32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlHi16(src[0], src[4]);
+ *row_sq3 = Sum3WHi32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m128i src, __m128i* const row3,
+ __m128i* const row5) {
+ __m128i s[5];
+ Prepare5Lo8(src, s);
+ const __m128i sum04 = VaddlLo8(s[0], s[4]);
+ *row3 = Sum3WLo16(s + 1);
+ *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+template <int offset>
+void SumHorizontal(const __m128i src[2], __m128i* const row3_0,
+ __m128i* const row3_1, __m128i* const row5_0,
+ __m128i* const row5_1) {
+ __m128i s[5];
+ Prepare5_8<offset>(src, s);
+ const __m128i sum04_lo = VaddlLo8(s[0], s[4]);
+ const __m128i sum04_hi = VaddlHi8(s[0], s[4]);
+ *row3_0 = Sum3WLo16(s + 1);
+ *row3_1 = Sum3WHi16(s + 1);
+ *row5_0 = _mm_add_epi16(sum04_lo, *row3_0);
+ *row5_1 = _mm_add_epi16(sum04_hi, *row3_1);
+}
+
+inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0,
+ __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+ __m128i* const row_sq5_1) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+ SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
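+// Sum343*() compute the 3-4-3 weighted sum 3*a + 4*b + 3*c of three adjacent
+// values, and Sum565*() compute the 5-6-5 weighted sum 5*a + 6*b + 5*c. These
+// weightings give the ma343/ma565 and b343/b565 intermediate buffers their
+// names.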
+inline __m128i Sum343Lo(const __m128i ma3[3]) {
+ const __m128i sum = Sum3WLo16(ma3);
+ const __m128i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343Hi(const __m128i ma3[3]) {
+ const __m128i sum = Sum3WHi16(ma3);
+ const __m128i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343WLo(const __m128i src[3]) {
+ const __m128i sum = Sum3WLo32(src);
+ const __m128i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwLo16(sum3, src[1]);
+}
+
+inline __m128i Sum343WHi(const __m128i src[3]) {
+ const __m128i sum = Sum3WHi32(src);
+ const __m128i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwHi16(sum3, src[1]);
+}
+
+inline void Sum343W(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum343WLo(s);
+ dst[1] = Sum343WHi(s);
+}
+
+inline __m128i Sum565Lo(const __m128i src[3]) {
+ const __m128i sum = Sum3WLo16(src);
+ const __m128i sum4 = _mm_slli_epi16(sum, 2);
+ const __m128i sum5 = _mm_add_epi16(sum4, sum);
+ return VaddwLo8(sum5, src[1]);
+}
+
+inline __m128i Sum565Hi(const __m128i src[3]) {
+ const __m128i sum = Sum3WHi16(src);
+ const __m128i sum4 = _mm_slli_epi16(sum, 2);
+ const __m128i sum5 = _mm_add_epi16(sum4, sum);
+ return VaddwHi8(sum5, src[1]);
+}
+
+inline __m128i Sum565WLo(const __m128i src[3]) {
+ const __m128i sum = Sum3WLo32(src);
+ const __m128i sum4 = _mm_slli_epi32(sum, 2);
+ const __m128i sum5 = _mm_add_epi32(sum4, sum);
+ return VaddwLo16(sum5, src[1]);
+}
+
+inline __m128i Sum565WHi(const __m128i src[3]) {
+ const __m128i sum = Sum3WHi32(src);
+ const __m128i sum4 = _mm_slli_epi32(sum, 2);
+ const __m128i sum5 = _mm_add_epi32(sum4, sum);
+ return VaddwHi16(sum5, src[1]);
+}
+
+inline void Sum565W(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum565WLo(s);
+ dst[1] = Sum565WHi(s);
+}
+
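+// Computes the horizontal 3-pixel and 5-pixel box sums of the source pixels
+// and of their squares for two rows, storing 16 results per iteration into
+// |sum3|, |sum5|, |square_sum3| and |square_sum5|.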
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ int y = 2;
+ do {
+ __m128i s[2], sq[3];
+ s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass1 - width);
+ sq[0] = SquareLo8(s[0]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
+ x -= 16;
+ src += 16;
+ s[1] = LoadUnaligned16Msan(src,
+ sum_width - x + kOverreadInBytesPass1 - width);
+ sq[1] = SquareHi8(s[0]);
+ sq[2] = SquareLo8(s[1]);
+ SumHorizontal<0>(s, &row3[0], &row3[1], &row5[0], &row5[1]);
+ StoreAligned32U16(sum3, row3);
+ StoreAligned32U16(sum5, row5);
+ SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned32U32(square_sum3 + 0, row_sq3);
+ StoreAligned32U32(square_sum5 + 0, row_sq5);
+ SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned32U32(square_sum3 + 8, row_sq3);
+ StoreAligned32U32(square_sum5 + 8, row_sq5);
+ s[0] = s[1];
+ sq[0] = sq[2];
+ sum3 += 16;
+ sum5 += 16;
+ square_sum3 += 16;
+ square_sum5 += 16;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sum3 += sum_stride - sum_width;
+ sum5 += sum_stride - sum_width;
+ square_sum3 += sum_stride - sum_width;
+ square_sum5 += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ constexpr int kOverreadInBytes =
+ (size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2;
+ int y = 2;
+ do {
+ __m128i s[2], sq[3];
+ s[0] = LoadUnaligned16Msan(src, kOverreadInBytes - width);
+ sq[0] = SquareLo8(s[0]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m128i row[2], row_sq[4];
+ x -= 16;
+ src += 16;
+ s[1] = LoadUnaligned16Msan(src, sum_width - x + kOverreadInBytes - width);
+ sq[1] = SquareHi8(s[0]);
+ sq[2] = SquareLo8(s[1]);
+ if (size == 3) {
+ Sum3Horizontal<0>(s, row);
+ Sum3WHorizontal(sq + 0, row_sq + 0);
+ Sum3WHorizontal(sq + 1, row_sq + 2);
+ } else {
+ Sum5Horizontal<0>(s, &row[0], &row[1]);
+ Sum5WHorizontal(sq + 0, row_sq + 0);
+ Sum5WHorizontal(sq + 1, row_sq + 2);
+ }
+ StoreAligned32U16(sums, row);
+ StoreAligned64U32(square_sums, row_sq);
+ s[0] = s[1];
+ sq[0] = sq[2];
+ sums += 16;
+ square_sums += 16;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sums += sum_stride - sum_width;
+ square_sums += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
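+// Computes the scaled box-variance value used as the index into kSgrMaLookup:
+// z = RightShiftWithRounding(max(n * sum_sq - sum * sum, 0) * scale,
+//                            kSgrProjScaleBits).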
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m128i dxd = _mm_madd_epi16(sum, sum);
+  // _mm_mullo_epi32() has high latency, so use shifts and additions instead.
+  // Some compilers could do this for us, but we make it explicit.
+ // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
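+  // n == 9:  sum_sq * 9  = sum_sq + (sum_sq << 3)
+  // n == 25: sum_sq * 25 = sum_sq + (sum_sq << 3) + (sum_sq << 4)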
+ __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+ const __m128i sub = _mm_sub_epi32(axn, dxd);
+ const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+ const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
+ const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
+ const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+ const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+ return _mm_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m128i CalculateB(const __m128i sum, const __m128i ma) {
+ static_assert(n == 9 || n == 25, "");
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
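+  // With kSgrProjReciprocalBits == 12: n == 25 gives one_over_n == 164 and
+  // n == 9 gives one_over_n == 455.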
+ const __m128i m0 = VmullLo16(ma, sum);
+ const __m128i m1 = VmullHi16(ma, sum);
+ const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+ const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+ const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+ const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+ return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n, int offset>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+ __m128i* const ma, __m128i* const b) {
+ static_assert(n == 9 || n == 25, "");
+ static_assert(offset == 0 || offset == 8, "");
+ const __m128i idx = _mm_packus_epi16(index, index);
+  // |temp| is not actually stored and reloaded; the compiler keeps it in a
+  // 64-bit general-purpose register, which is faster than using
+  // _mm_extract_epi8().
+ uint8_t temp[8];
+ StoreLo8(temp, idx);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
+  // b = ma * sum * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ __m128i maq;
+ if (offset == 0) {
+ maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ } else {
+ maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ }
+ *b = CalculateB<n>(sum, maq);
+}
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// so that the shuffle result is 0. The most significant bit 1 comes either
+// from the comparison instruction or from the sign bit of the index.
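+// For example, an index of 20 is greater than 15, so the comparison sets the
+// most significant bit of its mask byte and _mm_shuffle_epi8() produces 0 for
+// that lane; a negative index (sign bit already set) produces 0 the same way.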
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+ __m128i mask;
+ mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+ mask = _mm_or_si128(mask, index);
+ return _mm_shuffle_epi8(table, mask);
+}
+
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+ const int threshold) {
+ const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+ const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
+ return _mm_add_epi8(value, offset);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i* const ma, __m128i* const b0,
+ __m128i* const b1) {
+  // Use table lookup to read elements whose indices are less than 48.
+ const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+ const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+ const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+ const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+ __m128i idx;
+  // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+  // Elements whose indices are larger than 47 are left as 0 by the shuffles
+  // below.
+ // Get shuffle results for indices in range [0, 15].
+ *ma = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ *ma = _mm_or_si128(*ma, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res2 = ShuffleIndex(c2, idx);
+ *ma = _mm_or_si128(*ma, res2);
+
+  // For elements whose indices are larger than 47, the table values change
+  // only rarely as the index increases, so comparison and arithmetic
+  // operations are used to calculate them instead.
+  // Add -128 to apply signed comparison instructions.
+ idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+ *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
+  *ma = AdjustValue(*ma, idx, 55);  // 55 is the last index whose value is 5.
+  *ma = AdjustValue(*ma, idx, 72);  // 72 is the last index whose value is 4.
+  *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index whose value is 3.
+  *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index whose value is 2.
+  *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index whose value is 1.
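+  // For example, an index of 100 exceeds thresholds 55 and 72 but not 101, so
+  // its value becomes 5 - 2 = 3; an index of 255 exceeds all five thresholds
+  // and ends up as 0.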
+
+  // b = ma * sum * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ *b0 = CalculateB<9>(sum[0], maq0);
+ const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ *b1 = CalculateB<9>(sum[1], maq1);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i ma[2], __m128i b[2]) {
+ __m128i mas;
+ CalculateIntermediate(sum, index, &mas, &b[0], &b[1]);
+ ma[0] = _mm_unpacklo_epi64(ma[0], mas);
+ ma[1] = _mm_srli_si128(mas, 8);
+}
+
+// Note: Replacing the slow LookupIntermediate() with CalculateIntermediate()
+// when calculating 16 intermediate data points has been tried, but the
+// compiler generated even slower code.
+template <int offset>
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ static_assert(offset == 0 || offset == 8, "");
+ __m128i sum, index;
+ CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+ LookupIntermediate<25, offset>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ __m128i sum, index;
+ CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+ LookupIntermediate<9, 0>(sum, index, ma, b);
+}
+
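+// Store343_444*() compute the 4-4-4 weighted sum as 4 * (a + b + c) and derive
+// the 3-4-3 weighted sum from it as sum444 - sum111 + b, sharing the common
+// 1-1-1 sum, then store both results.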
+inline void Store343_444(const __m128i b3[2], const ptrdiff_t x,
+ __m128i sum_b343[2], __m128i sum_b444[2],
+ uint32_t* const b343, uint32_t* const b444) {
+ __m128i b[3], sum_b111[2];
+ Prepare3_16(b3, b);
+ sum_b111[0] = Sum3WLo32(b);
+ sum_b111[1] = Sum3WHi32(b);
+ sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
+ sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
+ StoreAligned32U32(b444 + x, sum_b444);
+ sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
+ sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
+ sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
+ sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
+ StoreAligned32U32(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i* const sum_ma444, __m128i sum_b343[2],
+ __m128i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m128i sum_ma111 = Sum3WLo16(ma3);
+ *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+ StoreAligned16(ma444 + x, *sum_ma444);
+ const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+ StoreAligned16(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i* const sum_ma444, __m128i sum_b343[2],
+ __m128i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m128i sum_ma111 = Sum3WHi16(ma3);
+ *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+ StoreAligned16(ma444 + x, *sum_ma444);
+ const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+ StoreAligned16(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma444, sum_b444[2];
+ Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma444, sum_b444[2];
+ Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma343, sum_b343[2];
+ Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma343, sum_b343[2];
+ Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
+ s5[0][3] = Sum5Horizontal(s[0][0]);
+ StoreAligned16(sum5[3], s5[0][3]);
+ s5[0][4] = Sum5Horizontal(s[1][0]);
+ StoreAligned16(sum5[4], s5[0][4]);
+ Sum5WHorizontal(sq[0], sq5[3]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ Sum5WHorizontal(sq[1], sq5[4]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5[0]);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const __m128i s[2][2], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma[2],
+ __m128i b[3]) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[0][2] = SquareLo8(s[0][1]);
+ sq[1][2] = SquareLo8(s[1][1]);
+ Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
+ StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+ Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
+ StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+ Sum5WHorizontal(sq[0] + 1, sq5[3]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ Sum5WHorizontal(sq[1] + 1, sq5[4]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);
+
+ sq[0][3] = SquareHi8(s[0][1]);
+ sq[1][3] = SquareHi8(s[1][1]);
+ Sum5WHorizontal(sq[0] + 2, sq5[3]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ Sum5WHorizontal(sq[1] + 2, sq5[4]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const __m128i s, const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[5], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ s5[3] = s5[4] = Sum5Horizontal(s);
+ Sum5WHorizontal(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma[2],
+ __m128i b[3]) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[2] = SquareLo8(s[1]);
+ Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]);
+ s5[0][4] = s5[0][3];
+ s5[1][4] = s5[1][3];
+ Sum5WHorizontal(sq + 1, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);
+
+ sq[3] = SquareHi8(s[1]);
+ Sum5WHorizontal(sq + 2, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const __m128i s, const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s3[3], sq3[3][2];
+ sq[1] = SquareHi8(s);
+ s3[2] = Sum3Horizontal(s);
+ StoreAligned16(sum3[2], s3[2]);
+ Sum3WHorizontal(sq, sq3[2]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const __m128i s[2], const ptrdiff_t x, const ptrdiff_t sum_width,
+ const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[4], __m128i ma[2],
+ __m128i b[3]) {
+ __m128i s3[4], sq3[3][2], sum[2], index[2];
+ sq[2] = SquareLo8(s[1]);
+ Sum3Horizontal<8>(s, s3 + 2);
+ StoreAligned32U16(sum3[2] + x, s3 + 2);
+ Sum3WHorizontal(sq + 1, sq3[2]);
+ StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
+ LoadAligned16x2U16(sum3, x, s3);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+ sq[3] = SquareHi8(s[1]);
+ Sum3WHorizontal(sq + 2, sq3[2]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, ma, b + 1);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const __m128i s[2][2], const uint16_t scales[2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma3[2][2],
+ __m128i b3[2][3], __m128i* const ma5, __m128i* const b5) {
+ __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
+ SumHorizontalLo(s[0][0], &s3[2], &s5[3]);
+ SumHorizontalLo(s[1][0], &s3[3], &s5[4]);
+ StoreAligned16(sum3[2], s3[2]);
+ StoreAligned16(sum3[3], s3[3]);
+ StoreAligned16(sum5[3], s5[3]);
+ StoreAligned16(sum5[4], s5[4]);
+ SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+ CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, &ma3[0][0], &b3[0][0], &b3[1][0]);
+ ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const __m128i s[2][2], const ptrdiff_t x, const uint16_t scales[2],
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, __m128i sq[2][4], __m128i ma3[2][2],
+ __m128i b3[2][3], __m128i ma5[2], __m128i b5[3]) {
+ __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2];
+ SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ StoreAligned16(sum3[2] + x + 0, s3[0][2]);
+ StoreAligned16(sum3[2] + x + 8, s3[1][2]);
+ StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+ SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+ StoreAligned16(sum3[3] + x + 0, s3[0][3]);
+ StoreAligned16(sum3[3] + x + 8, s3[1][3]);
+ StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+ sq[0][2] = SquareLo8(s[0][1]);
+ sq[1][2] = SquareLo8(s[1][1]);
+ SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3] + x, sq3[3]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
+ CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
+ &index[1][0]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[1]);
+
+ sq[0][3] = SquareHi8(s[0][1]);
+ sq[1][3] = SquareHi8(s[1][1]);
+ SumHorizontal(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ SumHorizontal(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
+ CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
+ &index[1][1]);
+ CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 1);
+ CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 1);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3,
+ __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
+ __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ SumHorizontalLo(s, &s3[2], &s5[3]);
+ SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ s5[4] = s5[3];
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma3[2],
+ __m128i ma5[2], __m128i b3[3], __m128i b5[3]) {
+ __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
+ sq[2] = SquareLo8(s[1]);
+ SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 1);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
+
+ sq[3] = SquareHi8(s[1]);
+ SumHorizontal(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 2);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, ma3, b3 + 1);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+ const uint8_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* ma565,
+ uint32_t* b565) {
+ __m128i s[2][2], mas[2], sq[2][4], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ __m128i ma5[3], ma[2], b[4];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+ bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[0] = Sum565Lo(ma5);
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned32U16(ma565, ma);
+ Sum565W(bs + 0, b + 0);
+ Sum565W(bs + 1, b + 2);
+ StoreAligned64U32(b565, b);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint8_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+ uint32_t* b444) {
+ __m128i s[2], mas[2], sq[4], bs[3];
+ s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass2 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ s[1] = LoadUnaligned16Msan(src + x + 16,
+ x + 16 + kOverreadInBytesPass2 - width);
+ BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ __m128i ma3[3];
+ Prepare3_8<0>(mas, ma3);
+ if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
+ Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444Hi(ma3, bs + 1, 8, ma343, ma444, b343, b444);
+ ma444 += 16;
+ b444 += 16;
+ } else {
+ __m128i ma[2], b[4];
+ ma[0] = Sum343Lo(ma3);
+ ma[1] = Sum343Hi(ma3);
+ StoreAligned32U16(ma343, ma);
+ Sum343W(bs + 0, b + 0);
+ Sum343W(bs + 1, b + 2);
+ StoreAligned64U32(b343, b);
+ }
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma343 += 16;
+ b343 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[2], uint16_t* ma565, uint32_t* const b343[4],
+ uint32_t* const b444[2], uint32_t* b565) {
+ __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], &b5[0]);
+
+ int x = 0;
+ do {
+ __m128i ma[2], b[4], ma3x[3], ma5x[3];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sum_width, sq, ma3, b3, ma5, b5);
+
+ Prepare3_8<0>(ma3[0], ma3x);
+ ma[0] = Sum343Lo(ma3x);
+ ma[1] = Sum343Hi(ma3x);
+ StoreAligned32U16(ma343[0] + x, ma);
+ Sum343W(b3[0] + 0, b + 0);
+ Sum343W(b3[0] + 1, b + 2);
+ StoreAligned64U32(b343[0] + x, b);
+ Sum565W(b5 + 0, b + 0);
+ Sum565W(b5 + 1, b + 2);
+ StoreAligned64U32(b565, b);
+ Prepare3_8<0>(ma3[1], ma3x);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
+ Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444[0], b343[1],
+ b444[0]);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0] = Sum565Lo(ma5x);
+ ma[1] = Sum565Hi(ma5x);
+ StoreAligned32U16(ma565, ma);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+template <int shift>
+inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) {
+ // ma: 255 * 32 = 8160 (13 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ // v: b - ma * 255 (22 bits)
+ const __m128i v = _mm_sub_epi32(b, ma_x_src);
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9 (13 bits)
+ return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
+ const __m128i b[2]) {
+ const __m128i ma_x_src_lo = VmullLo16(ma, src);
+ const __m128i ma_x_src_hi = VmullHi16(ma, src);
+ const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+ const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+ return _mm_packs_epi32(dst_lo, dst_hi); // 13 bits
+}
+
+inline __m128i CalculateFilteredOutputPass1(const __m128i src, __m128i ma[2],
+ __m128i b[2][2]) {
+ const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
+ __m128i b_sum[2];
+ b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
+ b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i CalculateFilteredOutputPass2(const __m128i src, __m128i ma[3],
+ __m128i b[3][2]) {
+ const __m128i ma_sum = Sum3_16(ma);
+ __m128i b_sum[2];
+ Sum3_32(b, b_sum);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
+ const __m128i v_lo =
+ VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m128i v_hi =
+ VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
+ return _mm_add_epi16(src, vv);
+}
+
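+// Packs w0 and w2 into each 32-bit lane so that a single _mm_madd_epi16()
+// computes w0 * filter[0] + w2 * filter[1] per pixel before the final rounding
+// shift in SelfGuidedFinal().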
+inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
+ const __m128i filter[2], const int w0,
+ const int w2) {
+ __m128i v[2];
+ const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+ const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
+ const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
+ v[0] = _mm_madd_epi16(w0_w2, f_lo);
+ v[1] = _mm_madd_epi16(w0_w2, f_hi);
+ return SelfGuidedFinal(src, v);
+}
+
+inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
+ const __m128i filter, const int w0) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ __m128i v[2];
+ v[0] = VmullNLo8(filter, w0);
+ v[1] = VmullNHi8(filter, w0);
+ return SelfGuidedFinal(src, v);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+ const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i s[2][2], mas[2], sq[2][4], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ __m128i ma[2], ma3[3], b[2][2], sr[2], p[2];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+ bs);
+ Prepare3_8<0>(mas, ma3);
+ ma[1] = Sum565Lo(ma3);
+ StoreAligned16(ma565[1] + x, ma[1]);
+ Sum565W(bs, b[1]);
+ StoreAligned32U32(b565[1] + x, b[1]);
+ sr[0] = LoadAligned16(src + x);
+ sr[1] = LoadAligned16(src + stride + x);
+ const __m128i sr0_lo = _mm_unpacklo_epi8(sr[0], _mm_setzero_si128());
+ const __m128i sr1_lo = _mm_unpacklo_epi8(sr[1], _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+ p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+ const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
+ const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
+
+ ma[1] = Sum565Hi(ma3);
+ StoreAligned16(ma565[1] + x + 8, ma[1]);
+ Sum565W(bs + 1, b[1]);
+ StoreAligned32U32(b565[1] + x + 8, b[1]);
+ const __m128i sr0_hi = _mm_unpackhi_epi8(sr[0], _mm_setzero_si128());
+ const __m128i sr1_hi = _mm_unpackhi_epi8(sr[1], _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
+ p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
+ const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
+ StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
+ const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
+ StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+ uint32_t* b565, uint8_t* const dst) {
+ __m128i s[2], mas[2], sq[4], bs[3];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcess5LastRowLo(s[0], scale, sum5, square_sum5, sq, &mas[0],
+ &bs[0]);
+
+ int x = 0;
+ do {
+ __m128i ma[2], ma5[3], b[2][2];
+ s[1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
+ sq, mas, bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ Sum565W(bs, b[1]);
+ ma[0] = LoadAligned16(ma565);
+ LoadAligned32U32(b565, b[0]);
+ const __m128i sr = LoadAligned16(src + x);
+ const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+ __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
+
+ ma[1] = Sum565Hi(ma5);
+ Sum565W(bs + 1, b[1]);
+ ma[0] = LoadAligned16(ma565 + 8);
+ LoadAligned32U32(b565 + 8, b[0]);
+ const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+ p = CalculateFilteredOutputPass1(sr_hi, ma, b);
+ const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+ uint32_t* const b444[2], uint8_t* const dst) {
+ __m128i s[2], mas[2], sq[4], bs[3];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass2 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ s[1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass2 - width);
+ BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ __m128i ma[3], b[3][2], ma3[3];
+ Prepare3_8<0>(mas, ma3);
+ Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
+ b444[1]);
+ const __m128i sr = LoadAligned16(src + x);
+ const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma343[0] + x);
+ ma[1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[0]);
+ LoadAligned32U32(b444[0] + x, b[1]);
+ const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+
+ Store343_444Hi(ma3, bs + 1, x + 8, &ma[2], b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1]);
+ const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
+ const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], &b5[0]);
+
+ int x = 0;
+ do {
+ __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sum_width, sq, ma3, b3, ma5, b5);
+ Prepare3_8<0>(ma3[0], ma3x[0]);
+ Prepare3_8<0>(ma3[1], ma3x[1]);
+ Prepare3_8<0>(ma5, ma5x);
+ Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ ma[0][1] = Sum565Lo(ma5x);
+ StoreAligned16(ma565[1] + x, ma[0][1]);
+ Sum565W(b5, b[0][1]);
+ StoreAligned32U32(b565[1] + x, b[0][1]);
+ const __m128i sr0 = LoadAligned16(src + x);
+ const __m128i sr1 = LoadAligned16(src + stride + x);
+ const __m128i sr0_lo = _mm_unpacklo_epi8(sr0, _mm_setzero_si128());
+ const __m128i sr1_lo = _mm_unpacklo_epi8(sr1, _mm_setzero_si128());
+ ma[0][0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned16(ma343[0] + x);
+ ma[1][1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[1][0]);
+ LoadAligned32U32(b444[0] + x, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+ const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+ ma[2][0] = LoadAligned16(ma343[1] + x);
+ LoadAligned32U32(b343[1] + x, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+ const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+ Store343_444Hi(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], b[1][2],
+ b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Hi(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ ma[0][1] = Sum565Hi(ma5x);
+ StoreAligned16(ma565[1] + x + 8, ma[0][1]);
+ Sum565W(b5 + 1, b[0][1]);
+ StoreAligned32U32(b565[1] + x + 8, b[0][1]);
+ const __m128i sr0_hi = _mm_unpackhi_epi8(sr0, _mm_setzero_si128());
+ const __m128i sr1_hi = _mm_unpackhi_epi8(sr1, _mm_setzero_si128());
+ ma[0][0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1][1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[1][0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
+ const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+ StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
+ ma[2][0] = LoadAligned16(ma343[1] + x + 8);
+ LoadAligned32U32(b343[1] + x + 8, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
+ const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+ StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i s[2], ma3[2], ma5[2], sq[4], b3[3], b5[3], ma[3], b[3][2];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcessLastRowLo(s[0], scales, sum3, sum5, square_sum3,
+ square_sum5, sq, &ma3[0], &ma5[0], &b3[0],
+ &b5[0]);
+
+ int x = 0;
+ do {
+ __m128i ma3x[3], ma5x[3], p[2];
+ s[1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
+ square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+ Prepare3_8<0>(ma3, ma3x);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[1] = Sum565Lo(ma5x);
+ Sum565W(b5, b[1]);
+ ma[2] = Sum343Lo(ma3x);
+ Sum343W(b3, b[2]);
+ const __m128i sr = LoadAligned16(src + x);
+ const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[0] = LoadAligned16(ma343[0] + x);
+ ma[1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[0]);
+ LoadAligned32U32(b444[0] + x, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+ ma[1] = Sum565Hi(ma5x);
+ Sum565W(b5 + 1, b[1]);
+ ma[2] = Sum343Hi(ma3x);
+ Sum343W(b3 + 1, b[2]);
+ const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
+ ma[0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
+ const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ ma3[0] = ma3[1];
+ ma5[0] = ma5[1];
+ b3[0] = b3[2];
+ b5[0] = b5[2];
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint8_t* src,
+ const uint8_t* const top_border, const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, stride, width, sum_stride, sum_width, sum3[0], sum5[1],
+ square_sum3[0], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, sum_width, ma343, ma444, ma565[0], b343,
+ b444, b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
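+  // Each iteration of this loop filters two rows. The row-sum buffers and the
+  // ma*/b* intermediates are rotated so that results computed for the previous
+  // pair of rows are reused and only the new rows are recomputed.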
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+ b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + stride, width, sum_width, scales,
+ w0, w2, sum3, sum5, square_sum3, square_sum5, ma343, ma444,
+ ma565, b343, b444, b565, dst);
+ }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src,
+ const uint8_t* const top_border,
+ const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, stride, width, sum_stride, sum_width, sum5[1],
+ square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ sum_width, scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src, bottom_border + stride, width, sum_width, scale,
+ w0, sum5, square_sum5, ma565[0], b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src,
+ const uint8_t* const top_border,
+ const uint8_t* bottom_border,
+ const ptrdiff_t stride, const int width,
+ const int height, SgrBuffer* const sgr_buffer,
+ uint8_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, stride, width, sum_stride, sum_width, sum3[0],
+ square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+ sum_width, ma343[0], nullptr, b343[0],
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint8_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+ ma343[1], ma444[0], b343[1], b444[0]);
+
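+  // Pass 2 filters one row per iteration; the ring of 3x3 row sums and the
+  // ma343/b343 and ma444/b444 intermediates rotate by one row each time.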
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ int y = std::min(height, 2);
+ src += 2;
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+// If |width| is not a multiple of 8, up to 7 extra pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it
+// will not be part of the visible frame.
+void SelfGuidedFilter_SSE4_1(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const void* const top_border, const void* const bottom_border,
+ const ptrdiff_t stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* top = static_cast<const uint8_t*>(top_border);
+ const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+ auto* const dst = static_cast<uint8_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3,
+ stride, width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2,
+ stride, width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride,
+ width, height, sgr_buffer, dst);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_SSE4_1;
+#else
+ static_cast<void>(WienerFilter_SSE4_1);
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(SelfGuidedFilter)
+ dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
+#else
+ static_cast<void>(SelfGuidedFilter_SSE4_1);
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void LoopRestorationInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/loop_restoration_sse4.h b/src/dsp/x86/loop_restoration_sse4.h
new file mode 100644
index 0000000..65b2b11
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_sse4.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_restorations, see the defines below for specifics.
+// These functions are not thread-safe.
+void LoopRestorationInit_SSE4_1();
+void LoopRestorationInit10bpp_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
diff --git a/src/dsp/x86/mask_blend_sse4.cc b/src/dsp/x86/mask_blend_sse4.cc
new file mode 100644
index 0000000..d8036be
--- /dev/null
+++ b/src/dsp/x86/mask_blend_sse4.cc
@@ -0,0 +1,447 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// When |subsampling_x| is 1 the width-4 mask is subsampled from a block of
+// width 8; the non-subsampled (444) case falls through to the plain 4x2 load
+// at the end of the function.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) {
+ if (subsampling_x == 1) {
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
+ const __m128i mask_val_1 =
+ _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
+ __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+ if (subsampling_y == 1) {
+ const __m128i next_mask_val_0 =
+ _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride));
+ const __m128i next_mask_val_1 =
+ _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride * 3));
+ subsampled_mask = _mm_add_epi16(
+ subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
+ }
+ return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+ }
+ const __m128i mask_val_0 = Load4(mask);
+ const __m128i mask_val_1 = Load4(mask + mask_stride);
+ return _mm_cvtepu8_epi16(
+ _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
+}
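+
+// A scalar sketch of the mask subsampling above (average with rounding, as in
+// the AV1 mask blend definition). The helper below is illustrative only and
+// is not used by the SIMD paths.
+inline int SubsampledMaskValueSketch(const uint8_t* mask, ptrdiff_t stride,
+                                     int x, int subsampling_x,
+                                     int subsampling_y) {
+  if (subsampling_x == 0) return mask[x];
+  int sum = mask[2 * x] + mask[2 * x + 1];
+  if (subsampling_y == 1) {
+    sum += mask[stride + 2 * x] + mask[stride + 2 * x + 1];
+  }
+  // RightShiftWithRounding by (subsampling_x + subsampling_y).
+  return (sum + (1 << subsampling_y)) >> (1 + subsampling_y);
+}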
+
+// This function returns a 16-bit packed mask to fit in _mm_madd_epi16.
+// 16-bit is also the lowest packing for hadd, but without subsampling an
+// extra 8-bit to 16-bit conversion is required.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask8(const uint8_t* mask, ptrdiff_t stride) {
+ if (subsampling_x == 1) {
+ const __m128i row_vals = LoadUnaligned16(mask);
+
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+ const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+ __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+
+ if (subsampling_y == 1) {
+ const __m128i next_row_vals = LoadUnaligned16(mask + stride);
+ const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
+ const __m128i next_mask_val_1 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
+ subsampled_mask = _mm_add_epi16(
+ subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
+ }
+ return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const __m128i mask_val = LoadLo8(mask);
+ return _mm_cvtepu8_epi16(mask_val);
+}
+
+// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because,
+// when is_inter_intra is true, the prediction values are brought to 8-bit
+// packing as well.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetInterIntraMask8(const uint8_t* mask, ptrdiff_t stride) {
+ if (subsampling_x == 1) {
+ const __m128i row_vals = LoadUnaligned16(mask);
+
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+ const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+ __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+
+ if (subsampling_y == 1) {
+ const __m128i next_row_vals = LoadUnaligned16(mask + stride);
+ const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
+ const __m128i next_mask_val_1 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
+ subsampled_mask = _mm_add_epi16(
+ subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
+ }
+ const __m128i ret =
+ RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+ return _mm_packus_epi16(ret, ret);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ // Unfortunately there is no shift operation for 8-bit packing, or else we
+ // could return everything with 8-bit packing.
+ const __m128i mask_val = LoadLo8(mask);
+ return mask_val;
+}
+
+inline void WriteMaskBlendLine4x2(const int16_t* const pred_0,
+ const int16_t* const pred_1,
+ const __m128i pred_mask_0,
+ const __m128i pred_mask_1, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i pred_val_0 = LoadAligned16(pred_0);
+ const __m128i pred_val_1 = LoadAligned16(pred_1);
+ const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
+ const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);
+ const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+ const __m128i pred_hi = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+ // int res = (mask_value * prediction_0[x] +
+ // (64 - mask_value) * prediction_1[x]) >> 6;
+ const __m128i compound_pred_lo = _mm_madd_epi16(pred_lo, mask_lo);
+ const __m128i compound_pred_hi = _mm_madd_epi16(pred_hi, mask_hi);
+ const __m128i compound_pred = _mm_packus_epi32(
+ _mm_srli_epi32(compound_pred_lo, 6), _mm_srli_epi32(compound_pred_hi, 6));
+
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ // (1 << kBitdepth8) - 1));
+ const __m128i result = RightShiftWithRounding_S16(compound_pred, 4);
+ const __m128i res = _mm_packus_epi16(result, result);
+ Store4(dst, res);
+ Store4(dst + dst_stride, _mm_srli_si128(res, 4));
+}
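+
+// A scalar sketch of the per-pixel blend performed above and in
+// MaskBlend_SSE4() below, with the inter post rounding bit equal to 4 for
+// 8bpp. The helper is illustrative only and is not used by the SIMD paths.
+inline uint8_t MaskBlendPixelSketch(int pred_0_val, int pred_1_val,
+                                    int mask_value) {
+  const int res =
+      (mask_value * pred_0_val + (64 - mask_value) * pred_1_val) >> 6;
+  const int rounded = (res + 8) >> 4;  // RightShiftWithRounding(res, 4).
+  const int clipped = (rounded < 0) ? 0 : ((rounded > 255) ? 255 : rounded);
+  return static_cast<uint8_t>(clipped);
+}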
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4x4_SSE4(const int16_t* pred_0, const int16_t* pred_1,
+ const uint8_t* mask,
+ const ptrdiff_t mask_stride, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ __m128i pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4xH_SSE4(const int16_t* pred_0, const int16_t* pred_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ const uint8_t* mask = mask_ptr;
+ if (height == 4) {
+ MaskBlending4x4_SSE4<subsampling_x, subsampling_y>(
+ pred_0, pred_1, mask, mask_stride, dst, dst_stride);
+ return;
+ }
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ int y = 0;
+ do {
+ __m128i pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += 4 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+ y += 8;
+ } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend_SSE4(const void* prediction_0, const void* prediction_1,
+ const ptrdiff_t /*prediction_stride_1*/,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride, const int width,
+ const int height, void* dest,
+ const ptrdiff_t dst_stride) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ const ptrdiff_t pred_stride_0 = width;
+ const ptrdiff_t pred_stride_1 = width;
+ if (width == 4) {
+ MaskBlending4xH_SSE4<subsampling_x, subsampling_y>(
+ pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride);
+ // 64 - mask
+ const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
+ const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);
+
+ const __m128i pred_val_0 = LoadAligned16(pred_0 + x);
+ const __m128i pred_val_1 = LoadAligned16(pred_1 + x);
+ const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+ const __m128i pred_hi = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+ // int res = (mask_value * prediction_0[x] +
+ // (64 - mask_value) * prediction_1[x]) >> 6;
+ const __m128i compound_pred_lo = _mm_madd_epi16(pred_lo, mask_lo);
+ const __m128i compound_pred_hi = _mm_madd_epi16(pred_hi, mask_hi);
+
+ const __m128i res = _mm_packus_epi32(_mm_srli_epi32(compound_pred_lo, 6),
+ _mm_srli_epi32(compound_pred_hi, 6));
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ // (1 << kBitdepth8) - 1));
+ const __m128i result = RightShiftWithRounding_S16(res, 4);
+ StoreLo8(dst + x, _mm_packus_epi16(result, result));
+
+ x += 8;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += pred_stride_0;
+ pred_1 += pred_stride_1;
+ mask += mask_stride << subsampling_y;
+ } while (++y < height);
+}
+
+inline void InterIntraWriteMaskBlendLine8bpp4x2(const uint8_t* const pred_0,
+ uint8_t* const pred_1,
+ const ptrdiff_t pred_stride_1,
+ const __m128i pred_mask_0,
+ const __m128i pred_mask_1) {
+ const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
+
+ const __m128i pred_val_0 = LoadLo8(pred_0);
+ // TODO(b/150326556): One load.
+ __m128i pred_val_1 = Load4(pred_1);
+ pred_val_1 = _mm_or_si128(_mm_slli_si128(Load4(pred_1 + pred_stride_1), 4),
+ pred_val_1);
+ const __m128i pred = _mm_unpacklo_epi8(pred_val_0, pred_val_1);
+ // int res = (mask_value * prediction_1[x] +
+ // (64 - mask_value) * prediction_0[x]) >> 6;
+ const __m128i compound_pred = _mm_maddubs_epi16(pred, pred_mask);
+ const __m128i result = RightShiftWithRounding_U16(compound_pred, 6);
+ const __m128i res = _mm_packus_epi16(result, result);
+
+ Store4(pred_1, res);
+ Store4(pred_1 + pred_stride_1, _mm_srli_si128(res, 4));
+}
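+
+// A scalar sketch of the inter-intra blend computed by _mm_maddubs_epi16
+// above. The helper is illustrative only; the weighted sum of two 8-bit
+// values fits in 14 bits, so no clipping is needed.
+inline uint8_t InterIntraBlendPixelSketch(uint8_t pred_0_val,
+                                          uint8_t pred_1_val,
+                                          int mask_value) {
+  const int res = mask_value * pred_1_val + (64 - mask_value) * pred_0_val;
+  return static_cast<uint8_t>((res + 32) >> 6);  // RightShiftWithRounding.
+}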
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4x4_SSE4(const uint8_t* pred_0,
+ uint8_t* pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* mask,
+ const ptrdiff_t mask_stride) {
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const __m128i pred_mask_u16_first =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ mask += mask_stride << (1 + subsampling_y);
+ const __m128i pred_mask_u16_second =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ mask += mask_stride << (1 + subsampling_y);
+ __m128i pred_mask_1 =
+ _mm_packus_epi16(pred_mask_u16_first, pred_mask_u16_second);
+ __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
+ InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1);
+ pred_0 += 4 << 1;
+ pred_1 += pred_stride_1 << 1;
+
+ pred_mask_1 = _mm_srli_si128(pred_mask_1, 8);
+ pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
+ InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4xH_SSE4(const uint8_t* pred_0,
+ uint8_t* pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride,
+ const int height) {
+ const uint8_t* mask = mask_ptr;
+ if (height == 4) {
+ InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ return;
+ }
+ int y = 0;
+ do {
+ InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ pred_0 += 4 << 2;
+ pred_1 += pred_stride_1 << 2;
+ mask += mask_stride << (2 + subsampling_y);
+
+ InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride);
+ pred_0 += 4 << 2;
+ pred_1 += pred_stride_1 << 2;
+ mask += mask_stride << (2 + subsampling_y);
+ y += 8;
+ } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+void InterIntraMaskBlend8bpp_SSE4(const uint8_t* prediction_0,
+ uint8_t* prediction_1,
+ const ptrdiff_t prediction_stride_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride, const int width,
+ const int height) {
+ if (width == 4) {
+ InterIntraMaskBlending8bpp4xH_SSE4<subsampling_x, subsampling_y>(
+ prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
+ height);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const __m128i pred_mask_1 =
+ GetInterIntraMask8<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride);
+ // 64 - mask
+ const __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
+ const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
+
+ const __m128i pred_val_0 = LoadLo8(prediction_0 + x);
+ const __m128i pred_val_1 = LoadLo8(prediction_1 + x);
+ const __m128i pred = _mm_unpacklo_epi8(pred_val_0, pred_val_1);
+ // int res = (mask_value * prediction_1[x] +
+ // (64 - mask_value) * prediction_0[x]) >> 6;
+ const __m128i compound_pred = _mm_maddubs_epi16(pred, pred_mask);
+ const __m128i result = RightShiftWithRounding_U16(compound_pred, 6);
+ const __m128i res = _mm_packus_epi16(result, result);
+
+ StoreLo8(prediction_1 + x, res);
+
+ x += 8;
+ } while (x < width);
+ prediction_0 += width;
+ prediction_1 += prediction_stride_1;
+ mask += mask_stride << subsampling_y;
+ } while (++y < height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend444)
+ dsp->mask_blend[0][0] = MaskBlend_SSE4<0, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend422)
+ dsp->mask_blend[1][0] = MaskBlend_SSE4<1, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend420)
+ dsp->mask_blend[2][0] = MaskBlend_SSE4<1, 1>;
+#endif
+  // For 8-bit, the is_inter_intra index of mask_blend[][] is handled by the
+  // separate inter_intra_mask_blend_8bpp[] functions.
+#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp444)
+ dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4<0, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp422)
+ dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4<1, 0>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp420)
+ dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4<1, 1>;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void MaskBlendInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void MaskBlendInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/mask_blend_sse4.h b/src/dsp/x86/mask_blend_sse4.h
new file mode 100644
index 0000000..52b0b5c
--- /dev/null
+++ b/src/dsp/x86/mask_blend_sse4.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mask_blend. This function is not thread-safe.
+void MaskBlendInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend444
+#define LIBGAV1_Dsp8bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend422
+#define LIBGAV1_Dsp8bpp_MaskBlend422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend420
+#define LIBGAV1_Dsp8bpp_MaskBlend420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
diff --git a/src/dsp/x86/motion_field_projection_sse4.cc b/src/dsp/x86/motion_field_projection_sse4.cc
new file mode 100644
index 0000000..c506941
--- /dev/null
+++ b/src/dsp/x86/motion_field_projection_sse4.cc
@@ -0,0 +1,397 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline __m128i LoadDivision(const __m128i division_table,
+ const __m128i reference_offset) {
+ const __m128i kOne = _mm_set1_epi16(0x0100);
+ const __m128i t = _mm_add_epi8(reference_offset, reference_offset);
+ const __m128i tt = _mm_unpacklo_epi8(t, t);
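+  // Adding kOne ({0, 1} per 16-bit lane) turns each duplicated byte pair
+  // {2 * r, 2 * r} into {2 * r, 2 * r + 1}, which selects the 16-bit entry
+  // division_table[r] in the byte shuffle below.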
+ const __m128i idx = _mm_add_epi8(tt, kOne);
+ return _mm_shuffle_epi8(division_table, idx);
+}
+
+inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
+ const int numerator) {
+ const __m128i m0 = _mm_madd_epi16(mv, denominator);
+ const __m128i m = _mm_mullo_epi32(m0, _mm_set1_epi32(numerator));
+ // Add the sign (0 or -1) to round towards zero.
+ const __m128i sign = _mm_srai_epi32(m, 31);
+ const __m128i add_sign = _mm_add_epi32(m, sign);
+ const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13));
+ return _mm_srai_epi32(sum, 14);
+}
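+
+// A scalar sketch of the rounding above (illustrative only). Adding the sign
+// (0 or -1) before the 1 << 13 offset rounds the final arithmetic shift to
+// the nearest integer with halves away from zero.
+inline int32_t MvProjectionSketch(int32_t mv, int32_t denominator,
+                                  int numerator) {
+  const int32_t m = mv * denominator * numerator;
+  const int32_t sign = (m < 0) ? -1 : 0;
+  return (m + sign + (1 << 13)) >> 14;
+}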
+
+inline __m128i MvProjectionClip(const __m128i mv, const __m128i denominator,
+ const int numerator) {
+ const __m128i mv0 = _mm_unpacklo_epi16(mv, _mm_setzero_si128());
+ const __m128i mv1 = _mm_unpackhi_epi16(mv, _mm_setzero_si128());
+ const __m128i denorm0 = _mm_unpacklo_epi16(denominator, _mm_setzero_si128());
+ const __m128i denorm1 = _mm_unpackhi_epi16(denominator, _mm_setzero_si128());
+ const __m128i s0 = MvProjection(mv0, denorm0, numerator);
+ const __m128i s1 = MvProjection(mv1, denorm1, numerator);
+ const __m128i projection = _mm_packs_epi32(s0, s1);
+ const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
+ const __m128i projection_mv_clamp_negative =
+ _mm_set1_epi16(-kProjectionMvClamp);
+ const __m128i clamp = _mm_min_epi16(projection, projection_mv_clamp);
+ return _mm_max_epi16(clamp, projection_mv_clamp_negative);
+}
+
+inline __m128i Project_SSE4_1(const __m128i delta, const __m128i dst_sign) {
+ // Add 63 to negative delta so that it shifts towards zero.
+ const __m128i delta_sign = _mm_srai_epi16(delta, 15);
+ const __m128i delta_sign_63 = _mm_srli_epi16(delta_sign, 10);
+ const __m128i delta_adjust = _mm_add_epi16(delta, delta_sign_63);
+ const __m128i offset0 = _mm_srai_epi16(delta_adjust, 6);
+ const __m128i offset1 = _mm_xor_si128(offset0, dst_sign);
+ return _mm_sub_epi16(offset1, dst_sign);
+}
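+
+// A scalar sketch of the projection offset above (illustrative only): divide
+// |delta| by 64 with truncation towards zero, then use the xor/subtract pair
+// to conditionally negate by |dst_sign| (0 or -1).
+inline int ProjectOffsetSketch(int delta, int dst_sign) {
+  const int offset = delta / 64;  // Integer division truncates towards zero.
+  return (offset ^ dst_sign) - dst_sign;
+}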
+
+inline void GetPosition(
+ const __m128i division_table, const MotionVector* const mv,
+ const int numerator, const int x8_start, const int x8_end, const int x8,
+ const __m128i& r_offsets, const __m128i& source_reference_type8,
+ const __m128i& skip_r, const __m128i& y8_floor8, const __m128i& y8_ceiling8,
+ const __m128i& d_sign, const int delta, __m128i* const r,
+ __m128i* const position_xy, int64_t* const skip_64, __m128i mvs[2]) {
+ const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
+ *r = _mm_shuffle_epi8(r_offsets, source_reference_type8);
+ const __m128i denorm = LoadDivision(division_table, source_reference_type8);
+ __m128i projection_mv[2];
+ mvs[0] = LoadUnaligned16(mv_int + 0);
+ mvs[1] = LoadUnaligned16(mv_int + 4);
+  // Deinterleave the x and y components.
+ const __m128i kShuffle =
+ _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+ const __m128i mv0 = _mm_shuffle_epi8(mvs[0], kShuffle);
+ const __m128i mv1 = _mm_shuffle_epi8(mvs[1], kShuffle);
+ const __m128i mv_y = _mm_unpacklo_epi64(mv0, mv1);
+ const __m128i mv_x = _mm_unpackhi_epi64(mv0, mv1);
+ // numerator could be 0.
+ projection_mv[0] = MvProjectionClip(mv_y, denorm, numerator);
+ projection_mv[1] = MvProjectionClip(mv_x, denorm, numerator);
+ // Do not update the motion vector if the block position is not valid or
+ // if position_x8 is outside the current range of x8_start and x8_end.
+ // Note that position_y8 will always be within the range of y8_start and
+ // y8_end.
+ // After subtracting the base, valid projections are within 8-bit.
+ const __m128i position_y = Project_SSE4_1(projection_mv[0], d_sign);
+ const __m128i position_x = Project_SSE4_1(projection_mv[1], d_sign);
+ const __m128i positions = _mm_packs_epi16(position_x, position_y);
+ const __m128i k01234567 =
+ _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
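+  // After packing, the low 8 bytes hold position_x and the high 8 bytes hold
+  // position_y; adding 0..7 gives each x lane its own column offset.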
+ *position_xy = _mm_add_epi8(positions, k01234567);
+ const int x8_floor = std::max(
+ x8_start - x8, delta - kProjectionMvMaxHorizontalOffset); // [-8, 8]
+ const int x8_ceiling =
+ std::min(x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset) -
+ 1; // [-1, 15]
+ const __m128i x8_floor8 = _mm_set1_epi8(x8_floor);
+ const __m128i x8_ceiling8 = _mm_set1_epi8(x8_ceiling);
+ const __m128i floor_xy = _mm_unpacklo_epi64(x8_floor8, y8_floor8);
+ const __m128i ceiling_xy = _mm_unpacklo_epi64(x8_ceiling8, y8_ceiling8);
+ const __m128i underflow = _mm_cmplt_epi8(*position_xy, floor_xy);
+ const __m128i overflow = _mm_cmpgt_epi8(*position_xy, ceiling_xy);
+ const __m128i out = _mm_or_si128(underflow, overflow);
+ const __m128i skip_low = _mm_or_si128(skip_r, out);
+ const __m128i skip = _mm_or_si128(skip_low, _mm_srli_si128(out, 8));
+ StoreLo8(skip_64, skip);
+}
+
+template <int idx>
+inline void Store(const __m128i position, const __m128i reference_offset,
+ const __m128i mv, int8_t* dst_reference_offset,
+ MotionVector* dst_mv) {
+ const ptrdiff_t offset =
+ static_cast<int16_t>(_mm_extract_epi16(position, idx));
+ if ((idx & 3) == 0) {
+ dst_mv[offset].mv32 = _mm_cvtsi128_si32(mv);
+ } else {
+ dst_mv[offset].mv32 = _mm_extract_epi32(mv, idx & 3);
+ }
+ dst_reference_offset[offset] = _mm_extract_epi8(reference_offset, idx);
+}
+
+template <int idx>
+inline void CheckStore(const int8_t* skips, const __m128i position,
+ const __m128i reference_offset, const __m128i mv,
+ int8_t* dst_reference_offset, MotionVector* dst_mv) {
+ if (skips[idx] == 0) {
+ Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv);
+ }
+}
+
+// 7.9.2.
+void MotionFieldProjectionKernel_SSE4_1(
+ const ReferenceInfo& reference_info,
+ const int reference_to_current_with_sign, const int dst_sign,
+ const int y8_start, const int y8_end, const int x8_start, const int x8_end,
+ TemporalMotionField* const motion_field) {
+ const ptrdiff_t stride = motion_field->mv.columns();
+ // The column range has to be offset by kProjectionMvMaxHorizontalOffset since
+ // coordinates in that range could end up being position_x8 because of
+ // projection.
+ const int adjusted_x8_start =
+ std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
+ const int adjusted_x8_end = std::min(
+ x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
+ const int adjusted_x8_end8 = adjusted_x8_end & ~7;
+ const int leftover = adjusted_x8_end - adjusted_x8_end8;
+ const int8_t* const reference_offsets =
+ reference_info.relative_distance_to.data();
+ const bool* const skip_references = reference_info.skip_references.data();
+ const int16_t* const projection_divisions =
+ reference_info.projection_divisions.data();
+ const ReferenceFrameType* source_reference_types =
+ &reference_info.motion_field_reference_frame[y8_start][0];
+ const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
+ int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
+ MotionVector* dst_mv = motion_field->mv[y8_start];
+ const __m128i d_sign = _mm_set1_epi16(dst_sign);
+
+ static_assert(sizeof(int8_t) == sizeof(bool), "");
+ static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), "");
+ static_assert(sizeof(int32_t) == sizeof(MotionVector), "");
+ assert(dst_sign == 0 || dst_sign == -1);
+ assert(stride == motion_field->reference_offset.columns());
+ assert((y8_start & 7) == 0);
+ assert((adjusted_x8_start & 7) == 0);
+ // The final position calculation is represented with int16_t. Valid
+ // position_y8 from its base is at most 7. After considering the horizontal
+ // offset which is at most |stride - 1|, we have the following assertion,
+ // which means this optimization works for frame width up to 32K (each
+  // position is an 8x8 block).
+ assert(8 * stride <= 32768);
+ const __m128i skip_reference = LoadLo8(skip_references);
+ const __m128i r_offsets = LoadLo8(reference_offsets);
+ const __m128i division_table = LoadUnaligned16(projection_divisions);
+
+ int y8 = y8_start;
+ do {
+ const int y8_floor = (y8 & ~7) - y8; // [-7, 0]
+ const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8) - 1; // [0, 7]
+ const __m128i y8_floor8 = _mm_set1_epi8(y8_floor);
+ const __m128i y8_ceiling8 = _mm_set1_epi8(y8_ceiling);
+ int x8;
+
+ for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) {
+ const __m128i source_reference_type8 =
+ LoadLo8(source_reference_types + x8);
+ const __m128i skip_r =
+ _mm_shuffle_epi8(skip_reference, source_reference_type8);
+ int64_t early_skip;
+ StoreLo8(&early_skip, skip_r);
+ // Early termination #1 if all are skips. Chance is typically ~30-40%.
+ if (early_skip == -1) continue;
+ int64_t skip_64;
+ __m128i r, position_xy, mvs[2];
+ GetPosition(division_table, mv, reference_to_current_with_sign, x8_start,
+ x8_end, x8, r_offsets, source_reference_type8, skip_r,
+ y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_xy, &skip_64,
+ mvs);
+ // Early termination #2 if all are skips.
+ // Chance is typically ~15-25% after Early termination #1.
+ if (skip_64 == -1) continue;
+ const __m128i p_y = _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8));
+ const __m128i p_x = _mm_cvtepi8_epi16(position_xy);
+ const __m128i p_y_offset = _mm_mullo_epi16(p_y, _mm_set1_epi16(stride));
+ const __m128i pos = _mm_add_epi16(p_y_offset, p_x);
+ const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8));
+ if (skip_64 == 0) {
+ // Store all. Chance is typically ~70-85% after Early termination #2.
+ Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ } else {
+ // Check and store each.
+ // Chance is typically ~15-30% after Early termination #2.
+ // The compiler is smart enough to not create the local buffer skips[].
+ int8_t skips[8];
+ memcpy(skips, &skip_64, sizeof(skips));
+ CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+ CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+ }
+ }
+
+    // The following leftover processing cannot be moved out of the do...while
+    // loop. Doing so may change the order in which results are stored to the
+    // same position.
+ if (leftover > 0) {
+ // Use SIMD only when leftover is at least 4, and there are at least 8
+ // elements in a row.
+ if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) {
+ // Process the last 8 elements to avoid loading invalid memory. Some
+ // elements may have been processed in the above loop, which is OK.
+ const int delta = 8 - leftover;
+ x8 = adjusted_x8_end - 8;
+ const __m128i source_reference_type8 =
+ LoadLo8(source_reference_types + x8);
+ const __m128i skip_r =
+ _mm_shuffle_epi8(skip_reference, source_reference_type8);
+ int64_t early_skip;
+ StoreLo8(&early_skip, skip_r);
+ // Early termination #1 if all are skips.
+ if (early_skip != -1) {
+ int64_t skip_64;
+ __m128i r, position_xy, mvs[2];
+ GetPosition(division_table, mv, reference_to_current_with_sign,
+ x8_start, x8_end, x8, r_offsets, source_reference_type8,
+ skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r,
+ &position_xy, &skip_64, mvs);
+ // Early termination #2 if all are skips.
+ if (skip_64 != -1) {
+ const __m128i p_y =
+ _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8));
+ const __m128i p_x = _mm_cvtepi8_epi16(position_xy);
+ const __m128i p_y_offset =
+ _mm_mullo_epi16(p_y, _mm_set1_epi16(stride));
+ const __m128i pos = _mm_add_epi16(p_y_offset, p_x);
+ const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8));
+ // Store up to 7 elements since leftover is at most 7.
+ if (skip_64 == 0) {
+ // Store all.
+ Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+ Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+ } else {
+ // Check and store each.
+ // The compiler is smart enough to not create the local buffer
+ // skips[].
+ int8_t skips[8];
+ memcpy(skips, &skip_64, sizeof(skips));
+ CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset,
+ dst_mv);
+ CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset,
+ dst_mv);
+ }
+ }
+ }
+ } else {
+ for (; x8 < adjusted_x8_end; ++x8) {
+ const int source_reference_type = source_reference_types[x8];
+ if (skip_references[source_reference_type]) continue;
+ MotionVector projection_mv;
+ // reference_to_current_with_sign could be 0.
+ GetMvProjection(mv[x8], reference_to_current_with_sign,
+ projection_divisions[source_reference_type],
+ &projection_mv);
+ // Do not update the motion vector if the block position is not valid
+ // or if position_x8 is outside the current range of x8_start and
+ // x8_end. Note that position_y8 will always be within the range of
+ // y8_start and y8_end.
+ const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
+ if (position_y8 < y8_floor || position_y8 > y8_ceiling) continue;
+ const int x8_base = x8 & ~7;
+ const int x8_floor =
+ std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
+ const int x8_ceiling =
+ std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
+ const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
+ if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
+ dst_mv[position_y8 * stride + position_x8] = mv[x8];
+ dst_reference_offset[position_y8 * stride + position_x8] =
+ reference_offsets[source_reference_type];
+ }
+ }
+ }
+
+ source_reference_types += stride;
+ mv += stride;
+ dst_reference_offset += stride;
+ dst_mv += stride;
+ } while (++y8 < y8_end);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1;
+}
+#endif
+
+} // namespace
+
+void MotionFieldProjectionInit_SSE4_1() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void MotionFieldProjectionInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/motion_field_projection_sse4.h b/src/dsp/x86/motion_field_projection_sse4.h
new file mode 100644
index 0000000..c05422c
--- /dev/null
+++ b/src/dsp/x86/motion_field_projection_sse4.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel
+#define LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
diff --git a/src/dsp/x86/motion_vector_search_sse4.cc b/src/dsp/x86/motion_vector_search_sse4.cc
new file mode 100644
index 0000000..e9cdd4c
--- /dev/null
+++ b/src/dsp/x86/motion_vector_search_sse4.cc
@@ -0,0 +1,262 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kProjectionMvDivisionLookup_32bit[kMaxFrameDistance + 1] = {
+ 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638,
+ 1489, 1365, 1260, 1170, 1092, 1024, 963, 910, 862, 819, 780,
+ 744, 712, 682, 655, 630, 606, 585, 564, 546, 528};
+
+inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
+ const __m128i numerator) {
+ const __m128i m0 = _mm_madd_epi16(mv, denominator);
+ const __m128i m = _mm_mullo_epi32(m0, numerator);
+ // Add the sign (0 or -1) to round towards zero.
+ const __m128i sign = _mm_srai_epi32(m, 31);
+ const __m128i add_sign = _mm_add_epi32(m, sign);
+ const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13));
+ return _mm_srai_epi32(sum, 14);
+}
+
+inline __m128i MvProjectionClip(const __m128i mvs[2],
+ const __m128i denominators[2],
+ const __m128i numerator) {
+ const __m128i s0 = MvProjection(mvs[0], denominators[0], numerator);
+ const __m128i s1 = MvProjection(mvs[1], denominators[1], numerator);
+ const __m128i mv = _mm_packs_epi32(s0, s1);
+ const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
+ const __m128i projection_mv_clamp_negative =
+ _mm_set1_epi16(-kProjectionMvClamp);
+ const __m128i clamp = _mm_min_epi16(mv, projection_mv_clamp);
+ return _mm_max_epi16(clamp, projection_mv_clamp_negative);
+}
+
+inline __m128i MvProjectionCompoundClip(
+ const MotionVector* const temporal_mvs,
+ const int8_t temporal_reference_offsets[2],
+ const int reference_offsets[2]) {
+ const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
+ const __m128i temporal_mv = LoadLo8(tmvs);
+ const __m128i temporal_mv_0 = _mm_cvtepu16_epi32(temporal_mv);
+ __m128i mvs[2], denominators[2];
+ mvs[0] = _mm_unpacklo_epi64(temporal_mv_0, temporal_mv_0);
+ mvs[1] = _mm_unpackhi_epi64(temporal_mv_0, temporal_mv_0);
+ denominators[0] = _mm_set1_epi32(
+ kProjectionMvDivisionLookup[temporal_reference_offsets[0]]);
+ denominators[1] = _mm_set1_epi32(
+ kProjectionMvDivisionLookup[temporal_reference_offsets[1]]);
+ const __m128i offsets = LoadLo8(reference_offsets);
+ const __m128i numerator = _mm_unpacklo_epi32(offsets, offsets);
+ return MvProjectionClip(mvs, denominators, numerator);
+}
+
+inline __m128i MvProjectionSingleClip(
+ const MotionVector* const temporal_mvs,
+ const int8_t* const temporal_reference_offsets,
+ const int reference_offset) {
+ const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
+ const __m128i temporal_mv = LoadAligned16(tmvs);
+ __m128i lookup = _mm_cvtsi32_si128(
+ kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[0]]);
+ lookup = _mm_insert_epi32(
+ lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[1]],
+ 1);
+ lookup = _mm_insert_epi32(
+ lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[2]],
+ 2);
+ lookup = _mm_insert_epi32(
+ lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[3]],
+ 3);
+ __m128i mvs[2], denominators[2];
+ mvs[0] = _mm_unpacklo_epi16(temporal_mv, _mm_setzero_si128());
+ mvs[1] = _mm_unpackhi_epi16(temporal_mv, _mm_setzero_si128());
+ denominators[0] = _mm_unpacklo_epi32(lookup, lookup);
+ denominators[1] = _mm_unpackhi_epi32(lookup, lookup);
+ const __m128i numerator = _mm_set1_epi32(reference_offset);
+ return MvProjectionClip(mvs, denominators, numerator);
+}
+
+inline void LowPrecision(const __m128i mv, void* const candidate_mvs) {
+ const __m128i kRoundDownMask = _mm_set1_epi16(~1);
+ const __m128i sign = _mm_srai_epi16(mv, 15);
+ const __m128i sub_sign = _mm_sub_epi16(mv, sign);
+ const __m128i d = _mm_and_si128(sub_sign, kRoundDownMask);
+ StoreAligned16(candidate_mvs, d);
+}
+
+inline void ForceInteger(const __m128i mv, void* const candidate_mvs) {
+ const __m128i kRoundDownMask = _mm_set1_epi16(~7);
+ const __m128i sign = _mm_srai_epi16(mv, 15);
+ const __m128i mv1 = _mm_add_epi16(mv, _mm_set1_epi16(3));
+ const __m128i mv2 = _mm_sub_epi16(mv1, sign);
+ const __m128i mv3 = _mm_and_si128(mv2, kRoundDownMask);
+ StoreAligned16(candidate_mvs, mv3);
+}
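+
+// Scalar sketches of the two rounding helpers above, assuming eighth-pel
+// motion vector units (the expressions are illustrative only):
+//   LowPrecision:  mv = (mv - (mv < 0 ? -1 : 0)) & ~1;      // quarter-pel
+//   ForceInteger:  mv = (mv + 3 - (mv < 0 ? -1 : 0)) & ~7;  // whole pel
+// The sign adjustment keeps the rounding symmetric for negative values.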
+
+void MvProjectionCompoundLowPrecision_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true, so it is skipped
+  // here. Make a local copy of |reference_offsets| to help the compiler keep
+  // the values in registers.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionCompoundClip(
+ temporal_mvs + i, temporal_reference_offsets + i, offsets);
+ LowPrecision(mv, candidate_mvs + i);
+ i += 2;
+ } while (i < count);
+}
+
+void MvProjectionCompoundForceInteger_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true, so it is skipped
+  // here. Make a local copy of |reference_offsets| to help the compiler keep
+  // the values in registers.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionCompoundClip(
+ temporal_mvs + i, temporal_reference_offsets + i, offsets);
+ ForceInteger(mv, candidate_mvs + i);
+ i += 2;
+ } while (i < count);
+}
+
+void MvProjectionCompoundHighPrecision_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offsets[2], const int count,
+ CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true, so it is skipped
+  // here. Make a local copy of |reference_offsets| to help the compiler keep
+  // the values in registers.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+ // One more element could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionCompoundClip(
+ temporal_mvs + i, temporal_reference_offsets + i, offsets);
+ StoreAligned16(candidate_mvs + i, mv);
+ i += 2;
+ } while (i < count);
+}
+
+void MvProjectionSingleLowPrecision_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionSingleClip(
+ temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+ LowPrecision(mv, candidate_mvs + i);
+ i += 4;
+ } while (i < count);
+}
+
+void MvProjectionSingleForceInteger_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionSingleClip(
+ temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+ ForceInteger(mv, candidate_mvs + i);
+ i += 4;
+ } while (i < count);
+}
+
+void MvProjectionSingleHighPrecision_SSE4_1(
+ const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ // Up to three more elements could be calculated.
+ int i = 0;
+ do {
+ const __m128i mv = MvProjectionSingleClip(
+ temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+ StoreAligned16(candidate_mvs + i, mv);
+ i += 4;
+ } while (i < count);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1;
+ dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1;
+ dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1;
+ dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1;
+ dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1;
+ dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1;
+}
+#endif
+
+} // namespace
+
+void MotionVectorSearchInit_SSE4_1() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void MotionVectorSearchInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/motion_vector_search_sse4.h b/src/dsp/x86/motion_vector_search_sse4.h
new file mode 100644
index 0000000..d65b392
--- /dev/null
+++ b/src/dsp/x86/motion_vector_search_sse4.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This
+// function is not thread-safe.
+void MotionVectorSearchInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MotionVectorSearch
+#define LIBGAV1_Dsp8bpp_MotionVectorSearch LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc
new file mode 100644
index 0000000..3a1d1fd
--- /dev/null
+++ b/src/dsp/x86/obmc_sse4.cc
@@ -0,0 +1,329 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+inline void OverlapBlendFromLeft2xH_SSE4_1(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+ const __m128i mask_val = _mm_shufflelo_epi16(Load4(kObmcMask), 0);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ int y = height;
+ do {
+ const __m128i pred_val = Load2x2(pred, pred + prediction_stride);
+ const __m128i obmc_pred_val =
+ Load2x2(obmc_pred, obmc_pred + obmc_prediction_stride);
+
+ const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+ const __m128i packed_result = _mm_packus_epi16(result, result);
+ Store2(pred, packed_result);
+ pred += prediction_stride;
+ const int16_t second_row_result = _mm_extract_epi16(packed_result, 1);
+ memcpy(pred, &second_row_result, sizeof(second_row_result));
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
+ y -= 2;
+ } while (y != 0);
+}
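+
+// Every overlap blend in this file computes the same per-pixel formula, shown
+// here as a scalar sketch (the mask indexing is illustrative):
+//   const int m = kObmcMask[mask_offset + i];
+//   pred[i] = (m * pred[i] + (64 - m) * obmc_pred[i] + 32) >> 6;
+// _mm_maddubs_epi16 forms both products from the interleaved bytes, and
+// RightShiftWithRounding_U16(..., 6) supplies the +32 bias and the shift.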
+
+inline void OverlapBlendFromLeft4xH_SSE4_1(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+ const __m128i mask_val = Load4(kObmcMask + 2);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ // Duplicate first half of vector.
+ const __m128i masks =
+ _mm_shuffle_epi32(_mm_unpacklo_epi8(mask_val, obmc_mask_val), 0x44);
+ int y = height;
+ do {
+ const __m128i pred_val0 = Load4(pred);
+ const __m128i obmc_pred_val0 = Load4(obmc_pred);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+
+ // Place the second row of each source in the second four bytes.
+ const __m128i pred_val =
+ _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
+ const __m128i obmc_pred_val = _mm_alignr_epi8(
+ Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
+ const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+ const __m128i packed_result = _mm_packus_epi16(result, result);
+ Store4(pred - prediction_stride, packed_result);
+ const int second_row_result = _mm_extract_epi32(packed_result, 1);
+ memcpy(pred, &second_row_result, sizeof(second_row_result));
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ y -= 2;
+ } while (y != 0);
+}
+
+inline void OverlapBlendFromLeft8xH_SSE4_1(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const __m128i mask_val = LoadLo8(kObmcMask + 6);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ int y = height;
+ do {
+ const __m128i pred_val = LoadLo8(pred);
+ const __m128i obmc_pred_val = LoadLo8(obmc_pred);
+ const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+
+ StoreLo8(pred, _mm_packus_epi16(result, result));
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (--y != 0);
+}
+
+void OverlapBlendFromLeft_SSE4_1(void* const prediction,
+ const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint8_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+
+ if (width == 2) {
+ OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ if (width == 4) {
+ OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ if (width == 8) {
+ OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const uint8_t* mask = kObmcMask + width - 2;
+ int x = 0;
+ do {
+ pred = static_cast<uint8_t*>(prediction) + x;
+ obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
+ const __m128i mask_val = LoadUnaligned16(mask + x);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks_lo = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ const __m128i masks_hi = _mm_unpackhi_epi8(mask_val, obmc_mask_val);
+
+ int y = 0;
+ do {
+ const __m128i pred_val = LoadUnaligned16(pred);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+ const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result_lo =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks_lo), 6);
+ const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+ const __m128i result_hi =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks_hi), 6);
+ StoreUnaligned16(pred, _mm_packus_epi16(result_lo, result_hi));
+
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y < height);
+ x += 16;
+ } while (x < width);
+}
+
+inline void OverlapBlendFromTop4xH_SSE4_1(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
+ const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
+
+ const uint8_t* mask = kObmcMask + height - 2;
+ const int compute_height = height - (height >> 2);
+ int y = 0;
+ do {
+ // First mask in the first half, second mask in the second half.
+ const __m128i mask_val = _mm_shuffle_epi8(
+ _mm_cvtsi32_si128(*reinterpret_cast<const uint16_t*>(mask + y)),
+ mask_shuffler);
+ const __m128i masks =
+ _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
+ const __m128i pred_val0 = Load4(pred);
+
+ const __m128i obmc_pred_val0 = Load4(obmc_pred);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ const __m128i pred_val =
+ _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
+ const __m128i obmc_pred_val = _mm_alignr_epi8(
+ Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
+ const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val);
+ const __m128i result =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+
+ const __m128i packed_result = _mm_packus_epi16(result, result);
+ Store4(pred - prediction_stride, packed_result);
+ Store4(pred, _mm_srli_si128(packed_result, 4));
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ y += 2;
+ } while (y < compute_height);
+}
+
+inline void OverlapBlendFromTop8xH_SSE4_1(
+ uint8_t* const prediction, const ptrdiff_t prediction_stride,
+ const int height, const uint8_t* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ uint8_t* pred = prediction;
+ const uint8_t* obmc_pred = obmc_prediction;
+ const uint8_t* mask = kObmcMask + height - 2;
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const int compute_height = height - (height >> 2);
+ int y = compute_height;
+ do {
+ const __m128i mask_val = _mm_set1_epi8(mask[compute_height - y]);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ const __m128i pred_val = LoadLo8(pred);
+ const __m128i obmc_pred_val = LoadLo8(obmc_pred);
+ const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+
+ StoreLo8(pred, _mm_packus_epi16(result, result));
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (--y != 0);
+}
+
+void OverlapBlendFromTop_SSE4_1(void* const prediction,
+ const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint8_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+
+ if (width <= 4) {
+ OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+ if (width == 8) {
+ OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+ obmc_prediction_stride);
+ return;
+ }
+
+ // Stop when mask value becomes 64.
+ const int compute_height = height - (height >> 2);
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ int y = 0;
+ const uint8_t* mask = kObmcMask + height - 2;
+ do {
+ const __m128i mask_val = _mm_set1_epi8(mask[y]);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ int x = 0;
+ do {
+ const __m128i pred_val = LoadUnaligned16(pred + x);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x);
+ const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result_lo =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6);
+ const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+ const __m128i result_hi =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6);
+ StoreUnaligned16(pred + x, _mm_packus_epi16(result_lo, result_hi));
+ x += 16;
+ } while (x < width);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride;
+ } while (++y < compute_height);
+}
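+
+// Note on |compute_height|: only the top 3/4 of the rows are blended. Past
+// that point the mask value is 64 (see the comment above), so the blend
+// reduces to (64 * pred + 0 * obmc_pred + 32) >> 6 == pred and those rows can
+// be left untouched.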
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(ObmcVertical)
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(ObmcHorizontal)
+ dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_SSE4_1;
+#endif
+}
+
+} // namespace
+
+void ObmcInit_SSE4_1() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void ObmcInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/obmc_sse4.h b/src/dsp/x86/obmc_sse4.h
new file mode 100644
index 0000000..bd8b416
--- /dev/null
+++ b/src/dsp/x86/obmc_sse4.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::obmc_blend[]. This function is not thread-safe.
+void ObmcInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_ObmcVertical
+#define LIBGAV1_Dsp8bpp_ObmcVertical LIBGAV1_CPU_SSE4_1
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ObmcHorizontal
+#define LIBGAV1_Dsp8bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
diff --git a/src/dsp/x86/super_res_sse4.cc b/src/dsp/x86/super_res_sse4.cc
new file mode 100644
index 0000000..b2bdfd2
--- /dev/null
+++ b/src/dsp/x86/super_res_sse4.cc
@@ -0,0 +1,166 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Upscale_Filter as defined in AV1 Section 7.16.
+// The taps are negated so that every value fits in 8 bits.
+alignas(16) const int8_t
+ kNegativeUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = {
+ {0, 0, 0, -128, 0, 0, 0, 0}, {0, 0, 1, -128, -2, 1, 0, 0},
+ {0, -1, 3, -127, -4, 2, -1, 0}, {0, -1, 4, -127, -6, 3, -1, 0},
+ {0, -2, 6, -126, -8, 3, -1, 0}, {0, -2, 7, -125, -11, 4, -1, 0},
+ {1, -2, 8, -125, -13, 5, -2, 0}, {1, -3, 9, -124, -15, 6, -2, 0},
+ {1, -3, 10, -123, -18, 6, -2, 1}, {1, -3, 11, -122, -20, 7, -3, 1},
+ {1, -4, 12, -121, -22, 8, -3, 1}, {1, -4, 13, -120, -25, 9, -3, 1},
+ {1, -4, 14, -118, -28, 9, -3, 1}, {1, -4, 15, -117, -30, 10, -4, 1},
+ {1, -5, 16, -116, -32, 11, -4, 1}, {1, -5, 16, -114, -35, 12, -4, 1},
+ {1, -5, 17, -112, -38, 12, -4, 1}, {1, -5, 18, -111, -40, 13, -5, 1},
+ {1, -5, 18, -109, -43, 14, -5, 1}, {1, -6, 19, -107, -45, 14, -5, 1},
+ {1, -6, 19, -105, -48, 15, -5, 1}, {1, -6, 19, -103, -51, 16, -5, 1},
+ {1, -6, 20, -101, -53, 16, -6, 1}, {1, -6, 20, -99, -56, 17, -6, 1},
+ {1, -6, 20, -97, -58, 17, -6, 1}, {1, -6, 20, -95, -61, 18, -6, 1},
+ {2, -7, 20, -93, -64, 18, -6, 2}, {2, -7, 20, -91, -66, 19, -6, 1},
+ {2, -7, 20, -88, -69, 19, -6, 1}, {2, -7, 20, -86, -71, 19, -6, 1},
+ {2, -7, 20, -84, -74, 20, -7, 2}, {2, -7, 20, -81, -76, 20, -7, 1},
+ {2, -7, 20, -79, -79, 20, -7, 2}, {1, -7, 20, -76, -81, 20, -7, 2},
+ {2, -7, 20, -74, -84, 20, -7, 2}, {1, -6, 19, -71, -86, 20, -7, 2},
+ {1, -6, 19, -69, -88, 20, -7, 2}, {1, -6, 19, -66, -91, 20, -7, 2},
+ {2, -6, 18, -64, -93, 20, -7, 2}, {1, -6, 18, -61, -95, 20, -6, 1},
+ {1, -6, 17, -58, -97, 20, -6, 1}, {1, -6, 17, -56, -99, 20, -6, 1},
+ {1, -6, 16, -53, -101, 20, -6, 1}, {1, -5, 16, -51, -103, 19, -6, 1},
+ {1, -5, 15, -48, -105, 19, -6, 1}, {1, -5, 14, -45, -107, 19, -6, 1},
+ {1, -5, 14, -43, -109, 18, -5, 1}, {1, -5, 13, -40, -111, 18, -5, 1},
+ {1, -4, 12, -38, -112, 17, -5, 1}, {1, -4, 12, -35, -114, 16, -5, 1},
+ {1, -4, 11, -32, -116, 16, -5, 1}, {1, -4, 10, -30, -117, 15, -4, 1},
+ {1, -3, 9, -28, -118, 14, -4, 1}, {1, -3, 9, -25, -120, 13, -4, 1},
+ {1, -3, 8, -22, -121, 12, -4, 1}, {1, -3, 7, -20, -122, 11, -3, 1},
+ {1, -2, 6, -18, -123, 10, -3, 1}, {0, -2, 6, -15, -124, 9, -3, 1},
+ {0, -2, 5, -13, -125, 8, -2, 1}, {0, -1, 4, -11, -125, 7, -2, 0},
+ {0, -1, 3, -8, -126, 6, -2, 0}, {0, -1, 3, -6, -127, 4, -1, 0},
+ {0, -1, 2, -4, -127, 3, -1, 0}, {0, 0, 1, -2, -128, 1, 0, 0},
+};
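+
+// Because the taps above are negated, SuperRes_SSE4_1() below subtracts the
+// _mm_maddubs_epi16 sums from the rounding constant instead of adding them.
+// A scalar sketch of one output pixel, where Clip255 and |p| are illustrative:
+//   int sum = 0;
+//   for (int k = 0; k < kSuperResFilterTaps; ++k) {
+//     sum -= kNegativeUpscaleFilter[p][k] * src[k];  // undo the negation
+//   }
+//   dst = Clip255((sum + (1 << (kFilterBits - 1))) >> kFilterBits);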
+
+void SuperResCoefficients_SSE4_1(const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const coefficients) {
+ auto* dst = static_cast<uint8_t*>(coefficients);
+ int subpixel_x = initial_subpixel_x;
+ int x = RightShiftWithCeiling(upscaled_width, 4);
+ do {
+ for (int i = 0; i < 8; ++i, dst += 16) {
+ int remainder = subpixel_x & kSuperResScaleMask;
+ __m128i filter =
+ LoadLo8(kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+ subpixel_x += step;
+ remainder = subpixel_x & kSuperResScaleMask;
+ filter = LoadHi8(filter,
+ kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+ subpixel_x += step;
+ StoreAligned16(dst, filter);
+ }
+ } while (--x != 0);
+}
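+
+// Each aligned 16-byte block written above carries the 8-tap filters for two
+// adjacent output pixels (low/high halves). SuperRes_SSE4_1() below gathers
+// the matching source pixels with LoadLo8/LoadHi8 so that a single
+// _mm_maddubs_epi16 covers both pixels.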
+
+void SuperRes_SSE4_1(const void* const coefficients, void* const source,
+ const ptrdiff_t stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const dest) {
+ auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = height;
+ do {
+ const auto* filter = static_cast<const uint8_t*>(coefficients);
+ uint8_t* dst_ptr = dst;
+ ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+ int subpixel_x = initial_subpixel_x;
+    // The code below calculates up to 15 extra upscaled pixels which will
+    // over-read up to 15 downscaled pixels at the end of each row.
+    // kSuperResHorizontalBorder accounts for this.
+ int x = RightShiftWithCeiling(upscaled_width, 4);
+ do {
+ __m128i weighted_src[8];
+ for (int i = 0; i < 8; ++i, filter += 16) {
+ __m128i s = LoadLo8(&src[subpixel_x >> kSuperResScaleBits]);
+ subpixel_x += step;
+ s = LoadHi8(s, &src[subpixel_x >> kSuperResScaleBits]);
+ subpixel_x += step;
+ const __m128i f = LoadAligned16(filter);
+ weighted_src[i] = _mm_maddubs_epi16(s, f);
+ }
+
+ __m128i a[4];
+ a[0] = _mm_hadd_epi16(weighted_src[0], weighted_src[1]);
+ a[1] = _mm_hadd_epi16(weighted_src[2], weighted_src[3]);
+ a[2] = _mm_hadd_epi16(weighted_src[4], weighted_src[5]);
+ a[3] = _mm_hadd_epi16(weighted_src[6], weighted_src[7]);
+ Transpose2x16_U16(a, a);
+ a[0] = _mm_adds_epi16(a[0], a[1]);
+ a[1] = _mm_adds_epi16(a[2], a[3]);
+ const __m128i rounding = _mm_set1_epi16(1 << (kFilterBits - 1));
+ a[0] = _mm_subs_epi16(rounding, a[0]);
+ a[1] = _mm_subs_epi16(rounding, a[1]);
+ a[0] = _mm_srai_epi16(a[0], kFilterBits);
+ a[1] = _mm_srai_epi16(a[1], kFilterBits);
+ StoreAligned16(dst_ptr, _mm_packus_epi16(a[0], a[1]));
+ dst_ptr += 16;
+ } while (--x != 0);
+ src += stride;
+ dst += stride;
+ } while (--y != 0);
+}
+
+void Init8bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ dsp->super_res_coefficients = SuperResCoefficients_SSE4_1;
+ dsp->super_res = SuperRes_SSE4_1;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void SuperResInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void SuperResInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/super_res_sse4.h b/src/dsp/x86/super_res_sse4.h
new file mode 100644
index 0000000..aef5147
--- /dev/null
+++ b/src/dsp/x86/super_res_sse4.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::super_res_coefficients and Dsp::super_res. This function
+// is not thread-safe.
+void SuperResInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_SuperRes
+#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
diff --git a/src/dsp/x86/transpose_sse4.h b/src/dsp/x86/transpose_sse4.h
new file mode 100644
index 0000000..208b301
--- /dev/null
+++ b/src/dsp/x86/transpose_sse4.h
@@ -0,0 +1,307 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <emmintrin.h>
+
+namespace libgav1 {
+namespace dsp {
+
+LIBGAV1_ALWAYS_INLINE void Transpose2x16_U16(const __m128i* const in,
+ __m128i* const out) {
+ // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 10 11 20 21 30 31
+  // in[1]: 40 41 50 51 60 61 70 71
+  // in[2]: 80 81 90 91 a0 a1 b0 b1
+  // in[3]: c0 c1 d0 d1 e0 e1 f0 f1
+ // to:
+ // a0: 00 40 01 41 10 50 11 51
+ // a1: 20 60 21 61 30 70 31 71
+ // a2: 80 c0 81 c1 90 d0 91 d1
+ // a3: a0 e0 a1 e1 b0 f0 b1 f1
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a3 = _mm_unpackhi_epi16(in[2], in[3]);
+ // b0: 00 20 40 60 01 21 41 61
+ // b1: 10 30 50 70 11 31 51 71
+ // b2: 80 a0 c0 e0 81 a1 c1 e1
+ // b3: 90 b0 d0 f0 91 b1 d1 f1
+ const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+ const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
+ const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 80 90 a0 b0 c0 d0 e0 f0
+ // out[3]: 81 91 a1 b1 c1 d1 e1 f1
+ out[0] = _mm_unpacklo_epi16(b0, b1);
+ out[1] = _mm_unpackhi_epi16(b0, b1);
+ out[2] = _mm_unpacklo_epi16(b2, b3);
+ out[3] = _mm_unpackhi_epi16(b2, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE __m128i Transpose4x4_U8(const __m128i* const in) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+
+ // Unpack 32 bit elements resulting in:
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ return _mm_unpacklo_epi16(a0, a1);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x8To4x16_U8(const __m128i* const in,
+ __m128i* out) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
+
+ // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi16(a2, a3);
+ const __m128i b2 = _mm_unpackhi_epi16(a0, a1);
+ const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+
+ // out[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // out[1]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // out[2]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // out[3]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi32(b0, b1);
+ out[1] = _mm_unpackhi_epi32(b0, b1);
+ out[2] = _mm_unpacklo_epi32(b2, b3);
+ out[3] = _mm_unpackhi_epi32(b2, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x4_U16(const __m128i* in, __m128i* out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // to:
+ // ba: 00 10 01 11 02 12 03 13
+ // dc: 20 30 21 31 22 32 23 33
+ const __m128i ba = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i dc = _mm_unpacklo_epi16(in[2], in[3]);
+ // Unpack 32 bit elements resulting in:
+ // dcba_lo: 00 10 20 30 01 11 21 31
+ // dcba_hi: 02 12 22 32 03 13 23 33
+ const __m128i dcba_lo = _mm_unpacklo_epi32(ba, dc);
+ const __m128i dcba_hi = _mm_unpackhi_epi32(ba, dc);
+ // Assign or shift right by 8 bytes resulting in:
+ // out[0]: 00 10 20 30 01 11 21 31
+ // out[1]: 01 11 21 31 XX XX XX XX
+ // out[2]: 02 12 22 32 03 13 23 33
+ // out[3]: 03 13 23 33 XX XX XX XX
+ out[0] = dcba_lo;
+ out[1] = _mm_srli_si128(dcba_lo, 8);
+ out[2] = dcba_hi;
+ out[3] = _mm_srli_si128(dcba_hi, 8);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4_U16(const __m128i* in,
+ __m128i* out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // in[4]: 40 41 42 43 XX XX XX XX
+ // in[5]: 50 51 52 53 XX XX XX XX
+ // in[6]: 60 61 62 63 XX XX XX XX
+ // in[7]: 70 71 72 73 XX XX XX XX
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 02 12 22 32 03 13 23 33
+ // b3: 42 52 62 72 43 53 63 73
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b2, b3);
+ out[3] = _mm_unpackhi_epi64(b2, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x4To4x8_U16(const __m128i* in,
+ __m128i* out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b2: 04 14 24 34 05 15 25 35
+ // b4: 02 12 22 32 03 13 23 33
+ // b6: 06 16 26 36 07 17 27 37
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 XX XX XX XX
+ // out[1]: 01 11 21 31 XX XX XX XX
+ // out[2]: 02 12 22 32 XX XX XX XX
+ // out[3]: 03 13 23 33 XX XX XX XX
+ // out[4]: 04 14 24 34 XX XX XX XX
+ // out[5]: 05 15 25 35 XX XX XX XX
+ // out[6]: 06 16 26 36 XX XX XX XX
+ // out[7]: 07 17 27 37 XX XX XX XX
+ const __m128i zeros = _mm_setzero_si128();
+ out[0] = _mm_unpacklo_epi64(b0, zeros);
+ out[1] = _mm_unpackhi_epi64(b0, zeros);
+ out[2] = _mm_unpacklo_epi64(b4, zeros);
+ out[3] = _mm_unpackhi_epi64(b4, zeros);
+ out[4] = _mm_unpacklo_epi64(b2, zeros);
+ out[5] = _mm_unpackhi_epi64(b2, zeros);
+ out[6] = _mm_unpacklo_epi64(b6, zeros);
+ out[7] = _mm_unpackhi_epi64(b6, zeros);
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x8_U16(const __m128i* const in,
+ __m128i* const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ // a6: 44 54 45 55 46 56 47 57
+ // a7: 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 04 14 24 34 05 15 25 35
+ // b3: 44 54 64 74 45 55 65 75
+ // b4: 02 12 22 32 03 13 23 33
+ // b5: 42 52 62 72 43 53 63 73
+ // b6: 06 16 26 36 07 17 27 37
+ // b7: 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+ const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ // out[4]: 04 14 24 34 44 54 64 74
+ // out[5]: 05 15 25 35 45 55 65 75
+ // out[6]: 06 16 26 36 46 56 66 76
+ // out[7]: 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b4, b5);
+ out[3] = _mm_unpackhi_epi64(b4, b5);
+ out[4] = _mm_unpacklo_epi64(b2, b3);
+ out[5] = _mm_unpackhi_epi64(b2, b3);
+ out[6] = _mm_unpacklo_epi64(b6, b7);
+ out[7] = _mm_unpackhi_epi64(b6, b7);
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+#endif // LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_
diff --git a/src/dsp/x86/warp_sse4.cc b/src/dsp/x86/warp_sse4.cc
new file mode 100644
index 0000000..43279ab
--- /dev/null
+++ b/src/dsp/x86/warp_sse4.cc
@@ -0,0 +1,525 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Number of extra bits of precision in warped filtering.
+constexpr int kWarpedDiffPrecisionBits = 10;
+
+// This assumes the two filters contain filter[x] and filter[x+2].
+inline __m128i AccumulateFilter(const __m128i sum, const __m128i filter_0,
+ const __m128i filter_1,
+ const __m128i& src_window) {
+ const __m128i filter_taps = _mm_unpacklo_epi8(filter_0, filter_1);
+ const __m128i src =
+ _mm_unpacklo_epi8(src_window, _mm_srli_si128(src_window, 2));
+ return _mm_add_epi16(sum, _mm_maddubs_epi16(src, filter_taps));
+}
+
+constexpr int kFirstPassOffset = 1 << 14;
+constexpr int kOffsetRemoval =
+ (kFirstPassOffset >> kInterRoundBitsHorizontal) * 128;
+
+// Applies the horizontal filter to one source row and stores the result in
+// |intermediate_result_row|. |intermediate_result_row| is a row in the 15x8
+// |intermediate_result| two-dimensional array.
+inline void HorizontalFilter(const int sx4, const int16_t alpha,
+ const __m128i src_row,
+ int16_t intermediate_result_row[8]) {
+ int sx = sx4 - MultiplyBy4(alpha);
+ __m128i filter[8];
+ for (__m128i& f : filter) {
+ const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ f = LoadLo8(kWarpedFilters8[offset]);
+ sx += alpha;
+ }
+ Transpose8x8To4x16_U8(filter, filter);
+ // |filter| now contains two filters per register.
+ // Staggered combinations allow us to take advantage of _mm_maddubs_epi16
+ // without overflowing the sign bit. The sign bit is hit only where two taps
+ // paired in a single madd add up to more than 128. This is only possible with
+ // two adjacent "inner" taps. Therefore, pairing odd with odd and even with
+ // even guarantees safety. |sum| is given a negative offset to allow for large
+ // intermediate values.
+ // k = 0, 2.
+ __m128i src_row_window = src_row;
+ __m128i sum = _mm_set1_epi16(-kFirstPassOffset);
+ sum = AccumulateFilter(sum, filter[0], filter[1], src_row_window);
+
+ // k = 1, 3.
+ src_row_window = _mm_srli_si128(src_row_window, 1);
+ sum = AccumulateFilter(sum, _mm_srli_si128(filter[0], 8),
+ _mm_srli_si128(filter[1], 8), src_row_window);
+ // k = 4, 6.
+ src_row_window = _mm_srli_si128(src_row_window, 3);
+ sum = AccumulateFilter(sum, filter[2], filter[3], src_row_window);
+
+ // k = 5, 7.
+ src_row_window = _mm_srli_si128(src_row_window, 1);
+ sum = AccumulateFilter(sum, _mm_srli_si128(filter[2], 8),
+ _mm_srli_si128(filter[3], 8), src_row_window);
+
+ sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal);
+ StoreUnaligned16(intermediate_result_row, sum);
+}
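+
+// Offset bookkeeping for the function above, as a scalar sketch (the names
+// are illustrative):
+//   intermediate = RightShiftWithRounding(
+//       -kFirstPassOffset + sum_of_products, kInterRoundBitsHorizontal);
+// The vertical pass multiplies eight such rows by taps that sum to 128, so
+// the array overload of WriteVerticalFilter() seeds its accumulator with
+// kOffsetRemoval = (kFirstPassOffset >> kInterRoundBitsHorizontal) * 128 to
+// cancel the offset.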
+
+template <bool is_compound>
+inline void WriteVerticalFilter(const __m128i filter[8],
+ const int16_t intermediate_result[15][8], int y,
+ void* dst_row) {
+ constexpr int kRoundBitsVertical =
+ is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+ __m128i sum_low = _mm_set1_epi32(kOffsetRemoval);
+ __m128i sum_high = sum_low;
+ for (int k = 0; k < 8; k += 2) {
+ const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]);
+ const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]);
+ const __m128i intermediate_0 = LoadUnaligned16(intermediate_result[y + k]);
+ const __m128i intermediate_1 =
+ LoadUnaligned16(intermediate_result[y + k + 1]);
+ const __m128i intermediate_low =
+ _mm_unpacklo_epi16(intermediate_0, intermediate_1);
+ const __m128i intermediate_high =
+ _mm_unpackhi_epi16(intermediate_0, intermediate_1);
+
+ const __m128i product_low = _mm_madd_epi16(filters_low, intermediate_low);
+ const __m128i product_high =
+ _mm_madd_epi16(filters_high, intermediate_high);
+ sum_low = _mm_add_epi32(sum_low, product_low);
+ sum_high = _mm_add_epi32(sum_high, product_high);
+ }
+ sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical);
+ sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
+ if (is_compound) {
+ const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
+ StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
+ } else {
+ const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
+ StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
+ }
+}
+
+template <bool is_compound>
+inline void WriteVerticalFilter(const __m128i filter[8],
+ const int16_t* intermediate_result_column,
+ void* dst_row) {
+ constexpr int kRoundBitsVertical =
+ is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+ __m128i sum_low = _mm_setzero_si128();
+ __m128i sum_high = _mm_setzero_si128();
+ for (int k = 0; k < 8; k += 2) {
+ const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]);
+ const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]);
+ // Equivalent to unpacking two vectors made by duplicating int16_t values.
+ const __m128i intermediate =
+ _mm_set1_epi32((intermediate_result_column[k + 1] << 16) |
+ intermediate_result_column[k]);
+ const __m128i product_low = _mm_madd_epi16(filters_low, intermediate);
+ const __m128i product_high = _mm_madd_epi16(filters_high, intermediate);
+ sum_low = _mm_add_epi32(sum_low, product_low);
+ sum_high = _mm_add_epi32(sum_high, product_high);
+ }
+ sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical);
+ sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
+ if (is_compound) {
+ const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
+ StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
+ } else {
+ const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
+ StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma,
+ int delta, DestType* dest_row,
+ ptrdiff_t dest_stride) {
+ int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ __m128i filter[8];
+ for (__m128i& f : filter) {
+ const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ f = LoadUnaligned16(kWarpedFilters[offset]);
+ sy += gamma;
+ }
+ Transpose8x8_U16(filter, filter);
+ WriteVerticalFilter<is_compound>(filter, source, y, dest_row);
+ dest_row += dest_stride;
+ sy4 += delta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void VerticalFilter(const int16_t* source_cols, int y4, int gamma,
+ int delta, DestType* dest_row,
+ ptrdiff_t dest_stride) {
+ int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ for (int y = 0; y < 8; ++y) {
+ int sy = sy4 - MultiplyBy4(gamma);
+ __m128i filter[8];
+ for (__m128i& f : filter) {
+ const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+ kWarpedPixelPrecisionShifts;
+ f = LoadUnaligned16(kWarpedFilters[offset]);
+ sy += gamma;
+ }
+ Transpose8x8_U16(filter, filter);
+ WriteVerticalFilter<is_compound>(filter, &source_cols[y], dest_row);
+ dest_row += dest_stride;
+ sy4 += delta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion1(const uint8_t* src, ptrdiff_t source_stride,
+ int source_width, int source_height, int ix4, int iy4,
+ DestType* dst_row, ptrdiff_t dest_stride) {
+ // Region 1
+ // Points to the left or right border of the first row of |src|.
+ const uint8_t* first_row_border =
+ (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ // Region 1.
+ // Every sample used to calculate the prediction block has the same
+ // value. So the whole prediction block has the same value.
+ const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const uint8_t row_border_pixel = first_row_border[row * source_stride];
+
+ if (is_compound) {
+ const __m128i sum =
+ _mm_set1_epi16(row_border_pixel << (kInterRoundBitsVertical -
+ kInterRoundBitsCompoundVertical));
+ StoreUnaligned16(dst_row, sum);
+ } else {
+ memset(dst_row, row_border_pixel, 8);
+ }
+ const DestType* const first_dst_row = dst_row;
+ dst_row += dest_stride;
+ for (int y = 1; y < 8; ++y) {
+ memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row));
+ dst_row += dest_stride;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion2(const uint8_t* src, ptrdiff_t source_stride,
+ int source_width, int y4, int ix4, int iy4, int gamma,
+ int delta, int16_t intermediate_result_column[15],
+ DestType* dst_row, ptrdiff_t dest_stride) {
+ // Region 2.
+ // Points to the left or right border of the first row of |src|.
+ const uint8_t* first_row_border =
+ (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+
+ // Region 2.
+ // Horizontal filter.
+ // The input values in this region are generated by extending the border
+ // which makes them identical in the horizontal direction. This
+ // computation could be inlined in the vertical pass but most
+ // implementations will need a transpose of some sort.
+ // It is not necessary to use the offset values here because the
+ // horizontal pass is a simple shift and the vertical pass will always
+ // require using 32 bits.
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved in
+ // warp.cc.
+ const int row = iy4 + y;
+ int sum = first_row_border[row * source_stride];
+ sum <<= (kFilterBits - kInterRoundBitsHorizontal);
+ intermediate_result_column[y + 7] = sum;
+ }
+ // Region 2 vertical filter.
+ VerticalFilter<is_compound, DestType>(intermediate_result_column, y4, gamma,
+ delta, dst_row, dest_stride);
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion3(const uint8_t* src, ptrdiff_t source_stride,
+ int source_height, int alpha, int beta, int x4, int ix4,
+ int iy4, int16_t intermediate_result[15][8]) {
+ // Region 3
+ // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ // Horizontal filter.
+ const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const uint8_t* const src_row = src + row * source_stride;
+ // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+ // read but is ignored.
+ //
+ // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+ // bytes after src_row[source_width - 1]. We assume the source frame
+ // has left and right borders of at least 13 bytes that extend the
+ // frame boundary pixels. We also assume there is at least one extra
+ // padding byte after the right border of the last source row.
+ const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
+ int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
+ sx4 += beta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion4(const uint8_t* src, ptrdiff_t source_stride, int alpha,
+ int beta, int x4, int ix4, int iy4,
+ int16_t intermediate_result[15][8]) {
+ // Region 4.
+ // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+ // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+ // const int row = Clip3(iy4 + y, 0, source_height - 1);
+ // In two special cases, iy4 + y is clipped to either 0 or
+ // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+ // bounded and we can avoid clipping iy4 + y by relying on a reference
+ // frame's boundary extension on the top and bottom.
+ // Horizontal filter.
+ int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ for (int y = -7; y < 8; ++y) {
+ // We may over-read up to 13 pixels above the top source row, or up
+ // to 13 pixels below the bottom source row. This is proved in
+ // warp.cc.
+ const int row = iy4 + y;
+ const uint8_t* const src_row = src + row * source_stride;
+ // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+ // read but is ignored.
+ //
+ // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+ // bytes after src_row[source_width - 1]. We assume the source frame
+ // has left and right borders of at least 13 bytes that extend the
+ // frame boundary pixels. We also assume there is at least one extra
+ // padding byte after the right border of the last source row.
+ const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
+    // src_row_v is filtered as uint8; HorizontalFilter() relies on a negative
+    // first-pass offset rather than converting the samples to int8 by
+    // subtracting 128.
+ HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
+ sx4 += beta;
+ }
+}
+
+template <bool is_compound, typename DestType>
+inline void HandleWarpBlock(const uint8_t* src, ptrdiff_t source_stride,
+ int source_width, int source_height,
+ const int* warp_params, int subsampling_x,
+ int subsampling_y, int src_x, int src_y,
+ int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta, DestType* dst_row,
+ ptrdiff_t dest_stride) {
+ union {
+ // Intermediate_result is the output of the horizontal filtering and
+ // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 -
+ // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t
+ // type so that we can start with a negative offset and restore it on the
+ // final filter sum.
+ int16_t intermediate_result[15][8]; // 15 rows, 8 columns.
+ // In the simple special cases where the samples in each row are all the
+ // same, store one sample per row in a column vector.
+ int16_t intermediate_result_column[15];
+ };
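+  // Only one union member is live for any given block: region 2 uses the
+  // column form, regions 3 and 4 use the full 15x8 array, and region 1
+  // needs neither, so the union simply saves stack space.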
+
+ const int dst_x =
+ src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
+ const int dst_y =
+ src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
+ const int x4 = dst_x >> subsampling_x;
+ const int y4 = dst_y >> subsampling_y;
+ const int ix4 = x4 >> kWarpedModelPrecisionBits;
+ const int iy4 = y4 >> kWarpedModelPrecisionBits;
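+  // x4 and y4 carry kWarpedModelPrecisionBits of fractional precision; ix4
+  // and iy4 are their integer parts and locate the source samples, while
+  // the fractional parts select the sub-pixel filters.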
+ // A prediction block may fall outside the frame's boundaries. If a
+ // prediction block is calculated using only samples outside the frame's
+ // boundary, the filtering can be simplified. We can divide the plane
+ // into several regions and handle them differently.
+ //
+ // | |
+ // 1 | 3 | 1
+ // | |
+ // -------+-----------+-------
+ // |***********|
+ // 2 |*****4*****| 2
+ // |***********|
+ // -------+-----------+-------
+ // | |
+ // 1 | 3 | 1
+ // | |
+ //
+ // At the center, region 4 represents the frame and is the general case.
+ //
+ // In regions 1 and 2, the prediction block is outside the frame's
+ // boundary horizontally. Therefore the horizontal filtering can be
+  // simplified. Furthermore, in region 1 (at the four corners), the
+ // prediction is outside the frame's boundary both horizontally and
+ // vertically, so we get a constant prediction block.
+ //
+ // In region 3, the prediction block is outside the frame's boundary
+  // vertically. Unfortunately, because we apply the horizontal filters
+ // first, by the time we apply the vertical filters, they no longer see
+ // simple inputs. So the only simplification is that all the rows are
+ // the same, but we still need to apply all the horizontal and vertical
+ // filters.
+
+ // Check for two simple special cases, where the horizontal filter can
+ // be significantly simplified.
+ //
+ // In general, for each row, the horizontal filter is calculated as
+ // follows:
+ // for (int x = -4; x < 4; ++x) {
+ // const int offset = ...;
+ // int sum = first_pass_offset;
+ // for (int k = 0; k < 8; ++k) {
+ // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+ // sum += kWarpedFilters[offset][k] * src_row[column];
+ // }
+ // ...
+ // }
+ // The column index before clipping, ix4 + x + k - 3, varies in the range
+ // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
+ // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
+ // border index (source_width - 1 or 0, respectively). Then for each x,
+ // the inner for loop of the horizontal filter is reduced to multiplying
+ // the border pixel by the sum of the filter coefficients.
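+  // For example (illustrative values only): with source_width = 64 and
+  // ix4 = 80, every column index clips to source_width - 1 = 63, and since
+  // the eight filter taps sum to 1 << kFilterBits, the row's filter output
+  // is just that border pixel shifted left by kFilterBits.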
+ if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
+    if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ // Outside the frame in both directions. One repeated value.
+ WarpRegion1<is_compound, DestType>(src, source_stride, source_width,
+ source_height, ix4, iy4, dst_row,
+ dest_stride);
+ return;
+ }
+    // Outside the frame horizontally. Each row is a single repeated value.
+ WarpRegion2<is_compound, DestType>(
+ src, source_stride, source_width, y4, ix4, iy4, gamma, delta,
+ intermediate_result_column, dst_row, dest_stride);
+ return;
+ }
+
+  if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ // Outside the frame vertically.
+ WarpRegion3<is_compound, DestType>(src, source_stride, source_height, alpha,
+ beta, x4, ix4, iy4, intermediate_result);
+ } else {
+ // Inside the frame.
+ WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta, x4, ix4,
+ iy4, intermediate_result);
+ }
+ // Region 3 and 4 vertical filter.
+ VerticalFilter<is_compound, DestType>(intermediate_result, y4, gamma, delta,
+ dst_row, dest_stride);
+}
+
+template <bool is_compound>
+void Warp_SSE4_1(const void* source, ptrdiff_t source_stride, int source_width,
+ int source_height, const int* warp_params, int subsampling_x,
+ int subsampling_y, int block_start_x, int block_start_y,
+ int block_width, int block_height, int16_t alpha, int16_t beta,
+ int16_t gamma, int16_t delta, void* dest,
+ ptrdiff_t dest_stride) {
+ const auto* const src = static_cast<const uint8_t*>(source);
+ using DestType =
+ typename std::conditional<is_compound, int16_t, uint8_t>::type;
+ auto* dst = static_cast<DestType*>(dest);
+
+  // The warp process is applied to each 8x8 block.
+ assert(block_width >= 8);
+ assert(block_height >= 8);
+ const int block_end_x = block_start_x + block_width;
+ const int block_end_y = block_start_y + block_height;
+
+ const int start_x = block_start_x;
+ const int start_y = block_start_y;
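+  // src_x and src_y are expressed on the luma grid (the warp parameters are
+  // defined in luma units), and the +4 offsets point at the center of each
+  // 8x8 block.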
+ int src_x = (start_x + 4) << subsampling_x;
+ int src_y = (start_y + 4) << subsampling_y;
+ const int end_x = (block_end_x + 4) << subsampling_x;
+ const int end_y = (block_end_y + 4) << subsampling_y;
+ do {
+ DestType* dst_row = dst;
+ src_x = (start_x + 4) << subsampling_x;
+ do {
+ HandleWarpBlock<is_compound, DestType>(
+ src, source_stride, source_width, source_height, warp_params,
+ subsampling_x, subsampling_y, src_x, src_y, alpha, beta, gamma, delta,
+ dst_row, dest_stride);
+ src_x += (8 << subsampling_x);
+ dst_row += 8;
+ } while (src_x < end_x);
+ dst += 8 * dest_stride;
+ src_y += (8 << subsampling_y);
+ } while (src_y < end_y);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->warp = Warp_SSE4_1</*is_compound=*/false>;
+ dsp->warp_compound = Warp_SSE4_1</*is_compound=*/true>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void WarpInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void WarpInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/warp_sse4.h b/src/dsp/x86/warp_sse4.h
new file mode 100644
index 0000000..a2dc5ca
--- /dev/null
+++ b/src/dsp/x86/warp_sse4.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::warp and Dsp::warp_compound. This function is not
+// thread-safe.
+void WarpInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_Warp
+#define LIBGAV1_Dsp8bpp_Warp LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WarpCompound
+#define LIBGAV1_Dsp8bpp_WarpCompound LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_
diff --git a/src/dsp/x86/weight_mask_sse4.cc b/src/dsp/x86/weight_mask_sse4.cc
new file mode 100644
index 0000000..dfd5662
--- /dev/null
+++ b/src/dsp/x86/weight_mask_sse4.cc
@@ -0,0 +1,464 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/x86/weight_mask_sse4.h"
+
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+constexpr int kRoundingBits8bpp = 4;
+
+template <bool mask_is_inverse>
+inline void WeightMask8_SSE4(const int16_t* prediction_0,
+ const int16_t* prediction_1, uint8_t* mask) {
+ const __m128i pred_0 = LoadAligned16(prediction_0);
+ const __m128i pred_1 = LoadAligned16(prediction_1);
+ const __m128i difference = RightShiftWithRounding_U16(
+ _mm_abs_epi16(_mm_sub_epi16(pred_0, pred_1)), kRoundingBits8bpp);
+ const __m128i scaled_difference = _mm_srli_epi16(difference, 4);
+ const __m128i difference_offset = _mm_set1_epi8(38);
+ const __m128i adjusted_difference =
+ _mm_adds_epu8(_mm_packus_epi16(scaled_difference, scaled_difference),
+ difference_offset);
+ const __m128i mask_ceiling = _mm_set1_epi8(64);
+ const __m128i mask_value = _mm_min_epi8(adjusted_difference, mask_ceiling);
+ if (mask_is_inverse) {
+ const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value);
+ StoreLo8(mask, inverted_mask_value);
+ } else {
+ StoreLo8(mask, mask_value);
+ }
+}
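+
+// Scalar sketch of the computation above, for reference only (it assumes the
+// rounding behavior of RightShiftWithRounding from src/utils/common.h; the
+// SIMD version additionally saturates intermediate bytes):
+//   for (int x = 0; x < 8; ++x) {
+//     const int diff = RightShiftWithRounding(
+//         std::abs(prediction_0[x] - prediction_1[x]), kRoundingBits8bpp);
+//     const int value = std::min(38 + (diff >> 4), 64);
+//     mask[x] = mask_is_inverse ? 64 - value : value;
+//   }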
+
+#define WEIGHT8_WITHOUT_STRIDE \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask)
+
+#define WEIGHT8_AND_STRIDE \
+ WEIGHT8_WITHOUT_STRIDE; \
+ pred_0 += 8; \
+ pred_1 += 8; \
+ mask += mask_stride
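+
+// The *_AND_STRIDE macros emit one 8-wide mask row and then advance the
+// prediction pointers by one row and the mask pointer by its stride; the
+// last row of each block uses *_WITHOUT_STRIDE so the pointers stop at the
+// end of the block.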
+
+template <bool mask_is_inverse>
+void WeightMask8x8_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = 0;
+ do {
+ WEIGHT8_AND_STRIDE;
+ } while (++y < 7);
+ WEIGHT8_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x16_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ } while (++y3 < 5);
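+  // 5 iterations of 3 rows plus the final row below cover all 16 rows.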
+ WEIGHT8_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x32_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_AND_STRIDE;
+ } while (++y5 < 6);
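+  // 6 iterations of 5 rows plus the final 2 rows below cover all 32 rows.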
+ WEIGHT8_AND_STRIDE;
+ WEIGHT8_WITHOUT_STRIDE;
+}
+
+#define WEIGHT16_WITHOUT_STRIDE \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8)
+
+#define WEIGHT16_AND_STRIDE \
+ WEIGHT16_WITHOUT_STRIDE; \
+ pred_0 += 16; \
+ pred_1 += 16; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask16x8_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ } while (++y < 7);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x16_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x32_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x64_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ WEIGHT16_AND_STRIDE;
+ } while (++y3 < 21);
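+  // 21 iterations of 3 rows plus the final row below cover all 64 rows.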
+ WEIGHT16_WITHOUT_STRIDE;
+}
+
+#define WEIGHT32_WITHOUT_STRIDE \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24)
+
+#define WEIGHT32_AND_STRIDE \
+ WEIGHT32_WITHOUT_STRIDE; \
+ pred_0 += 32; \
+ pred_1 += 32; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask32x8_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x16_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x32_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x64_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ WEIGHT32_AND_STRIDE;
+ } while (++y3 < 21);
+ WEIGHT32_WITHOUT_STRIDE;
+}
+
+#define WEIGHT64_WITHOUT_STRIDE \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 32, pred_1 + 32, mask + 32); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 40, pred_1 + 40, mask + 40); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 48, pred_1 + 48, mask + 48); \
+ WeightMask8_SSE4<mask_is_inverse>(pred_0 + 56, pred_1 + 56, mask + 56)
+
+#define WEIGHT64_AND_STRIDE \
+ WEIGHT64_WITHOUT_STRIDE; \
+ pred_0 += 64; \
+ pred_1 += 64; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask64x16_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 5);
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x32_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y5 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y5 < 6);
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x64_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 21);
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x128_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ do {
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_AND_STRIDE;
+ } while (++y3 < 42);
+ WEIGHT64_AND_STRIDE;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x64_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
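+  // Each 128-wide mask row is produced as two 64-byte halves; after the
+  // second half, the mask pointer advances by mask_stride - 64 to reach the
+  // start of the next row.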
+ do {
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (++y3 < 21);
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x128_SSE4(const void* prediction_0, const void* prediction_1,
+ uint8_t* mask, ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int y3 = 0;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+ do {
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (++y3 < 42);
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE;
+}
+
+#define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \
+ dsp->weight_mask[w_index][h_index][0] = \
+ WeightMask##width##x##height##_SSE4<0>; \
+ dsp->weight_mask[w_index][h_index][1] = WeightMask##width##x##height##_SSE4<1>
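+
+// w_index and h_index are log2(block dimension) - 3; the last array index
+// selects the regular (0) or inverse (1) mask.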
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ INIT_WEIGHT_MASK_8BPP(8, 8, 0, 0);
+ INIT_WEIGHT_MASK_8BPP(8, 16, 0, 1);
+ INIT_WEIGHT_MASK_8BPP(8, 32, 0, 2);
+ INIT_WEIGHT_MASK_8BPP(16, 8, 1, 0);
+ INIT_WEIGHT_MASK_8BPP(16, 16, 1, 1);
+ INIT_WEIGHT_MASK_8BPP(16, 32, 1, 2);
+ INIT_WEIGHT_MASK_8BPP(16, 64, 1, 3);
+ INIT_WEIGHT_MASK_8BPP(32, 8, 2, 0);
+ INIT_WEIGHT_MASK_8BPP(32, 16, 2, 1);
+ INIT_WEIGHT_MASK_8BPP(32, 32, 2, 2);
+ INIT_WEIGHT_MASK_8BPP(32, 64, 2, 3);
+ INIT_WEIGHT_MASK_8BPP(64, 16, 3, 1);
+ INIT_WEIGHT_MASK_8BPP(64, 32, 3, 2);
+ INIT_WEIGHT_MASK_8BPP(64, 64, 3, 3);
+ INIT_WEIGHT_MASK_8BPP(64, 128, 3, 4);
+ INIT_WEIGHT_MASK_8BPP(128, 64, 4, 3);
+ INIT_WEIGHT_MASK_8BPP(128, 128, 4, 4);
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void WeightMaskInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void WeightMaskInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/weight_mask_sse4.h b/src/dsp/x86/weight_mask_sse4.h
new file mode 100644
index 0000000..07636b7
--- /dev/null
+++ b/src/dsp/x86/weight_mask_sse4.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::weight_mask. This function is not thread-safe.
+void WeightMaskInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x8
+#define LIBGAV1_Dsp8bpp_WeightMask_8x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x16
+#define LIBGAV1_Dsp8bpp_WeightMask_8x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x32
+#define LIBGAV1_Dsp8bpp_WeightMask_8x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x8
+#define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x16
+#define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x32
+#define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x64
+#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x8
+#define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x16
+#define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x32
+#define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x64
+#define LIBGAV1_Dsp8bpp_WeightMask_32x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x16
+#define LIBGAV1_Dsp8bpp_WeightMask_64x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x32
+#define LIBGAV1_Dsp8bpp_WeightMask_64x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x64
+#define LIBGAV1_Dsp8bpp_WeightMask_64x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x128
+#define LIBGAV1_Dsp8bpp_WeightMask_64x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x64
+#define LIBGAV1_Dsp8bpp_WeightMask_128x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x128
+#define LIBGAV1_Dsp8bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_