1 files changed, 166 insertions, 0 deletions
diff --git a/src/dsp/x86/super_res_sse4.cc b/src/dsp/x86/super_res_sse4.cc
new file mode 100644
index 0000000..b2bdfd2
--- /dev/null
+++ b/src/dsp/x86/super_res_sse4.cc
@@ -0,0 +1,166 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Upscale_Filter as defined in AV1 Section 7.16
+// Negative to make them fit in 8-bit.
+alignas(16) const int8_t
+    kNegativeUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = {
+        {0, 0, 0, -128, 0, 0, 0, 0},       {0, 0, 1, -128, -2, 1, 0, 0},
+        {0, -1, 3, -127, -4, 2, -1, 0},    {0, -1, 4, -127, -6, 3, -1, 0},
+        {0, -2, 6, -126, -8, 3, -1, 0},    {0, -2, 7, -125, -11, 4, -1, 0},
+        {1, -2, 8, -125, -13, 5, -2, 0},   {1, -3, 9, -124, -15, 6, -2, 0},
+        {1, -3, 10, -123, -18, 6, -2, 1},  {1, -3, 11, -122, -20, 7, -3, 1},
+        {1, -4, 12, -121, -22, 8, -3, 1},  {1, -4, 13, -120, -25, 9, -3, 1},
+        {1, -4, 14, -118, -28, 9, -3, 1},  {1, -4, 15, -117, -30, 10, -4, 1},
+        {1, -5, 16, -116, -32, 11, -4, 1}, {1, -5, 16, -114, -35, 12, -4, 1},
+        {1, -5, 17, -112, -38, 12, -4, 1}, {1, -5, 18, -111, -40, 13, -5, 1},
+        {1, -5, 18, -109, -43, 14, -5, 1}, {1, -6, 19, -107, -45, 14, -5, 1},
+        {1, -6, 19, -105, -48, 15, -5, 1}, {1, -6, 19, -103, -51, 16, -5, 1},
+        {1, -6, 20, -101, -53, 16, -6, 1}, {1, -6, 20, -99, -56, 17, -6, 1},
+        {1, -6, 20, -97, -58, 17, -6, 1},  {1, -6, 20, -95, -61, 18, -6, 1},
+        {2, -7, 20, -93, -64, 18, -6, 2},  {2, -7, 20, -91, -66, 19, -6, 1},
+        {2, -7, 20, -88, -69, 19, -6, 1},  {2, -7, 20, -86, -71, 19, -6, 1},
+        {2, -7, 20, -84, -74, 20, -7, 2},  {2, -7, 20, -81, -76, 20, -7, 1},
+        {2, -7, 20, -79, -79, 20, -7, 2},  {1, -7, 20, -76, -81, 20, -7, 2},
+        {2, -7, 20, -74, -84, 20, -7, 2},  {1, -6, 19, -71, -86, 20, -7, 2},
+        {1, -6, 19, -69, -88, 20, -7, 2},  {1, -6, 19, -66, -91, 20, -7, 2},
+        {2, -6, 18, -64, -93, 20, -7, 2},  {1, -6, 18, -61, -95, 20, -6, 1},
+        {1, -6, 17, -58, -97, 20, -6, 1},  {1, -6, 17, -56, -99, 20, -6, 1},
+        {1, -6, 16, -53, -101, 20, -6, 1}, {1, -5, 16, -51, -103, 19, -6, 1},
+        {1, -5, 15, -48, -105, 19, -6, 1}, {1, -5, 14, -45, -107, 19, -6, 1},
+        {1, -5, 14, -43, -109, 18, -5, 1}, {1, -5, 13, -40, -111, 18, -5, 1},
+        {1, -4, 12, -38, -112, 17, -5, 1}, {1, -4, 12, -35, -114, 16, -5, 1},
+        {1, -4, 11, -32, -116, 16, -5, 1}, {1, -4, 10, -30, -117, 15, -4, 1},
+        {1, -3, 9, -28, -118, 14, -4, 1},  {1, -3, 9, -25, -120, 13, -4, 1},
+        {1, -3, 8, -22, -121, 12, -4, 1},  {1, -3, 7, -20, -122, 11, -3, 1},
+        {1, -2, 6, -18, -123, 10, -3, 1},  {0, -2, 6, -15, -124, 9, -3, 1},
+        {0, -2, 5, -13, -125, 8, -2, 1},   {0, -1, 4, -11, -125, 7, -2, 0},
+        {0, -1, 3, -8, -126, 6, -2, 0},    {0, -1, 3, -6, -127, 4, -1, 0},
+        {0, -1, 2, -4, -127, 3, -1, 0},    {0, 0, 1, -2, -128, 1, 0, 0},
+};
+
+void SuperResCoefficients_SSE4_1(const int upscaled_width,
+                                 const int initial_subpixel_x, const int step,
+                                 void* const coefficients) {
+  auto* dst = static_cast<uint8_t*>(coefficients);
+  int subpixel_x = initial_subpixel_x;
+  int x = RightShiftWithCeiling(upscaled_width, 4);
+  do {
+    for (int i = 0; i < 8; ++i, dst += 16) {
+      int remainder = subpixel_x & kSuperResScaleMask;
+      __m128i filter =
+          LoadLo8(kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+      subpixel_x += step;
+      remainder = subpixel_x & kSuperResScaleMask;
+      filter = LoadHi8(filter,
+                       kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+      subpixel_x += step;
+      StoreAligned16(dst, filter);
+    }
+  } while (--x != 0);
+}
+
+void SuperRes_SSE4_1(const void* const coefficients, void* const source,
+                     const ptrdiff_t stride, const int height,
+                     const int downscaled_width, const int upscaled_width,
+                     const int initial_subpixel_x, const int step,
+                     void* const dest) {
+  auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
+  auto* dst = static_cast<uint8_t*>(dest);
+  int y = height;
+  do {
+    const auto* filter = static_cast<const uint8_t*>(coefficients);
+    uint8_t* dst_ptr = dst;
+    ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+                        kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+    int subpixel_x = initial_subpixel_x;
+    // The below code calculates up to 15 extra upscaled
+    // pixels which will over-read up to 15 downscaled pixels in the end of each
+    // row. kSuperResHorizontalBorder accounts for this.
+    int x = RightShiftWithCeiling(upscaled_width, 4);
+    do {
+      __m128i weighted_src[8];
+      for (int i = 0; i < 8; ++i, filter += 16) {
+        __m128i s = LoadLo8(&src[subpixel_x >> kSuperResScaleBits]);
+        subpixel_x += step;
+        s = LoadHi8(s, &src[subpixel_x >> kSuperResScaleBits]);
+        subpixel_x += step;
+        const __m128i f = LoadAligned16(filter);
+        weighted_src[i] = _mm_maddubs_epi16(s, f);
+      }
+
+      __m128i a[4];
+      a[0] = _mm_hadd_epi16(weighted_src[0], weighted_src[1]);
+      a[1] = _mm_hadd_epi16(weighted_src[2], weighted_src[3]);
+      a[2] = _mm_hadd_epi16(weighted_src[4], weighted_src[5]);
+      a[3] = _mm_hadd_epi16(weighted_src[6], weighted_src[7]);
+      Transpose2x16_U16(a, a);
+      a[0] = _mm_adds_epi16(a[0], a[1]);
+      a[1] = _mm_adds_epi16(a[2], a[3]);
+      const __m128i rounding = _mm_set1_epi16(1 << (kFilterBits - 1));
+      a[0] = _mm_subs_epi16(rounding, a[0]);
+      a[1] = _mm_subs_epi16(rounding, a[1]);
+      a[0] = _mm_srai_epi16(a[0], kFilterBits);
+      a[1] = _mm_srai_epi16(a[1], kFilterBits);
+      StoreAligned16(dst_ptr, _mm_packus_epi16(a[0], a[1]));
+      dst_ptr += 16;
+    } while (--x != 0);
+    src += stride;
+    dst += stride;
+  } while (--y != 0);
+}
+
+void Init8bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  dsp->super_res_coefficients = SuperResCoefficients_SSE4_1;
+  dsp->super_res = SuperRes_SSE4_1;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void SuperResInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void SuperResInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1