author    Boyuan Yang <byang@debian.org>  2021-11-07 08:50:18 -0500
committer Boyuan Yang <byang@debian.org>  2021-11-07 08:50:18 -0500
commit    320ef65362608ee1148c299d8d5d7618af34e470 (patch)
tree      c47911c219d1e35b8b0771e9e0176eff0e0d08ec /src/dsp/x86
parent    2381d803c76105f44717d75f089ec37f51e5cfe4 (diff)
download  libgav1-320ef65362608ee1148c299d8d5d7618af34e470.tar.gz
          libgav1-320ef65362608ee1148c299d8d5d7618af34e470.tar.bz2
          libgav1-320ef65362608ee1148c299d8d5d7618af34e470.zip
New upstream version 0.17.0
Diffstat (limited to 'src/dsp/x86')
-rw-r--r--  src/dsp/x86/average_blend_sse4.cc           |  36
-rw-r--r--  src/dsp/x86/cdef_avx2.cc                    |  30
-rw-r--r--  src/dsp/x86/cdef_sse4.cc                    |  29
-rw-r--r--  src/dsp/x86/common_avx2_test.cc             |  67
-rw-r--r--  src/dsp/x86/common_sse4_test.cc             |  64
-rw-r--r--  src/dsp/x86/convolve_avx2.cc                | 127
-rw-r--r--  src/dsp/x86/convolve_sse4.cc                | 284
-rw-r--r--  src/dsp/x86/distance_weighted_blend_sse4.cc |  52
-rw-r--r--  src/dsp/x86/film_grain_sse4.cc              | 106
-rw-r--r--  src/dsp/x86/intra_edge_sse4.cc              |   9
-rw-r--r--  src/dsp/x86/intrapred_cfl_sse4.cc           |  53
-rw-r--r--  src/dsp/x86/intrapred_filter_sse4.cc        |  23
-rw-r--r--  src/dsp/x86/intrapred_smooth_sse4.cc        | 378
-rw-r--r--  src/dsp/x86/intrapred_sse4.cc               | 202
-rw-r--r--  src/dsp/x86/inverse_transform_sse4.cc       | 191
-rw-r--r--  src/dsp/x86/inverse_transform_sse4.h        |  52
-rw-r--r--  src/dsp/x86/loop_restoration_10bit_avx2.cc  |  22
-rw-r--r--  src/dsp/x86/loop_restoration_10bit_sse4.cc  |  22
-rw-r--r--  src/dsp/x86/loop_restoration_avx2.cc        |  22
-rw-r--r--  src/dsp/x86/loop_restoration_sse4.cc        |  22
-rw-r--r--  src/dsp/x86/mask_blend_sse4.cc              | 159
-rw-r--r--  src/dsp/x86/motion_field_projection_sse4.cc |  21
-rw-r--r--  src/dsp/x86/motion_vector_search_sse4.cc    |  65
-rw-r--r--  src/dsp/x86/obmc_sse4.cc                    | 142
-rw-r--r--  src/dsp/x86/super_res_sse4.cc               |  12
-rw-r--r--  src/dsp/x86/warp_sse4.cc                    |  66
-rw-r--r--  src/dsp/x86/weight_mask_sse4.cc             | 198
27 files changed, 1368 insertions(+), 1086 deletions(-)
diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc
index ec9f589..911c5a9 100644
--- a/src/dsp/x86/average_blend_sse4.cc
+++ b/src/dsp/x86/average_blend_sse4.cc
@@ -35,8 +35,9 @@ namespace {
constexpr int kInterPostRoundBit = 4;
-inline void AverageBlend4Row(const int16_t* prediction_0,
- const int16_t* prediction_1, uint8_t* dest) {
+inline void AverageBlend4Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT dest) {
const __m128i pred_0 = LoadLo8(prediction_0);
const __m128i pred_1 = LoadLo8(prediction_1);
__m128i res = _mm_add_epi16(pred_0, pred_1);
@@ -44,8 +45,9 @@ inline void AverageBlend4Row(const int16_t* prediction_0,
Store4(dest, _mm_packus_epi16(res, res));
}
-inline void AverageBlend8Row(const int16_t* prediction_0,
- const int16_t* prediction_1, uint8_t* dest) {
+inline void AverageBlend8Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT dest) {
const __m128i pred_0 = LoadAligned16(prediction_0);
const __m128i pred_1 = LoadAligned16(prediction_1);
__m128i res = _mm_add_epi16(pred_0, pred_1);
@@ -53,9 +55,10 @@ inline void AverageBlend8Row(const int16_t* prediction_0,
StoreLo8(dest, _mm_packus_epi16(res, res));
}
-inline void AverageBlendLargeRow(const int16_t* prediction_0,
- const int16_t* prediction_1, const int width,
- uint8_t* dest) {
+inline void AverageBlendLargeRow(const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1,
+ const int width,
+ uint8_t* LIBGAV1_RESTRICT dest) {
int x = 0;
do {
const __m128i pred_00 = LoadAligned16(&prediction_0[x]);
@@ -71,8 +74,10 @@ inline void AverageBlendLargeRow(const int16_t* prediction_0,
} while (x < width);
}
-void AverageBlend_SSE4_1(const void* prediction_0, const void* prediction_1,
- const int width, const int height, void* const dest,
+void AverageBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const int width, const int height,
+ void* LIBGAV1_RESTRICT const dest,
const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
@@ -148,11 +153,11 @@ namespace {
constexpr int kInterPostRoundBitPlusOne = 5;
template <const int width, const int offset>
-inline void AverageBlendRow(const uint16_t* prediction_0,
- const uint16_t* prediction_1,
+inline void AverageBlendRow(const uint16_t* LIBGAV1_RESTRICT prediction_0,
+ const uint16_t* LIBGAV1_RESTRICT prediction_1,
const __m128i& compound_offset,
const __m128i& round_offset, const __m128i& max,
- const __m128i& zero, uint16_t* dst,
+ const __m128i& zero, uint16_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t dest_stride) {
// pred_0/1 max range is 16b.
const __m128i pred_0 = LoadUnaligned16(prediction_0 + offset);
@@ -182,9 +187,10 @@ inline void AverageBlendRow(const uint16_t* prediction_0,
StoreHi8(dst + dest_stride, result);
}
-void AverageBlend10bpp_SSE4_1(const void* prediction_0,
- const void* prediction_1, const int width,
- const int height, void* const dest,
+void AverageBlend10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const int width, const int height,
+ void* LIBGAV1_RESTRICT const dest,
const ptrdiff_t dst_stride) {
auto* dst = static_cast<uint16_t*>(dest);
const ptrdiff_t dest_stride = dst_stride / sizeof(dst[0]);
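
The change running through every file in this release is the LIBGAV1_RESTRICT
annotation on kernel pointer arguments. A minimal sketch of what it buys,
assuming the macro expands to the compiler's __restrict extension (its
definition lives outside this directory):

    #include <cstdint>

    // With __restrict the compiler may assume pred_0, pred_1 and dest never
    // alias, so loaded values can stay in registers across the stores instead
    // of being reloaded. Scalar stand-in for the AverageBlend kernels above;
    // the (sum + 16) >> 5 rounding mirrors kInterPostRoundBitPlusOne == 5
    // from the 10bpp path and is an assumption for 8bpp.
    inline void AverageBlendRowModel(const int16_t* __restrict pred_0,
                                     const int16_t* __restrict pred_1,
                                     const int width,
                                     uint8_t* __restrict dest) {
      for (int x = 0; x < width; ++x) {
        const int sum = pred_0[x] + pred_1[x];
        const int value = (sum + 16) >> 5;  // RightShiftWithRounding(sum, 5)
        dest[x] =
            static_cast<uint8_t>(value < 0 ? 0 : (value > 255 ? 255 : value));
      }
    }
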
diff --git a/src/dsp/x86/cdef_avx2.cc b/src/dsp/x86/cdef_avx2.cc
index d41dc38..01a2b9f 100644
--- a/src/dsp/x86/cdef_avx2.cc
+++ b/src/dsp/x86/cdef_avx2.cc
@@ -269,8 +269,8 @@ LIBGAV1_ALWAYS_INLINE void AddPartial_D7_D5(__m256i* v_src, __m256i* partial_lo,
_mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[3], 10));
}
-LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* src, ptrdiff_t stride,
- __m256i* partial) {
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* LIBGAV1_RESTRICT src,
+ ptrdiff_t stride, __m256i* partial) {
// 8x8 input
// 00 01 02 03 04 05 06 07
// 10 11 12 13 14 15 16 17
@@ -451,8 +451,10 @@ inline void Cost2And6_Pair(uint32_t* cost, const __m256i partial_a,
cost[6] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
}
-void CdefDirection_AVX2(const void* const source, ptrdiff_t stride,
- uint8_t* const direction, int* const variance) {
+void CdefDirection_AVX2(const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride,
+ uint8_t* LIBGAV1_RESTRICT const direction,
+ int* LIBGAV1_RESTRICT const variance) {
assert(direction != nullptr);
assert(variance != nullptr);
const auto* src = static_cast<const uint8_t*>(source);
@@ -500,8 +502,9 @@ void CdefDirection_AVX2(const void* const source, ptrdiff_t stride,
// CdefFilter
// Load 4 vectors based on the given |direction|.
-inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
- __m128i* output, const int direction) {
+inline void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
+ const ptrdiff_t stride, __m128i* output,
+ const int direction) {
// Each |direction| describes a different set of source values. Expand this
// set by negating each set. For |direction| == 0 this gives a diagonal line
// from top right to bottom left. The first value is y, the second x. Negative
@@ -525,8 +528,9 @@ inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
// do 2 rows at a time.
-void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
- __m128i* output, const int direction) {
+void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
+ const ptrdiff_t stride, __m128i* output,
+ const int direction) {
const int y_0 = kCdefDirections[direction][0][0];
const int x_0 = kCdefDirections[direction][0][1];
const int y_1 = kCdefDirections[direction][1][0];
@@ -569,11 +573,11 @@ inline __m256i ApplyConstrainAndTap(const __m256i& pixel, const __m256i& val,
}
template <int width, bool enable_primary = true, bool enable_secondary = true>
-void CdefFilter_AVX2(const uint16_t* src, const ptrdiff_t src_stride,
- const int height, const int primary_strength,
- const int secondary_strength, const int damping,
- const int direction, void* dest,
- const ptrdiff_t dst_stride) {
+void CdefFilter_AVX2(const uint16_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride, const int height,
+ const int primary_strength, const int secondary_strength,
+ const int damping, const int direction,
+ void* LIBGAV1_RESTRICT dest, const ptrdiff_t dst_stride) {
static_assert(width == 8 || width == 4, "Invalid CDEF width.");
static_assert(enable_primary || enable_secondary, "");
constexpr bool clipping_required = enable_primary && enable_secondary;
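
LoadDirection and LoadDirection4 expand the two kCdefDirections offsets for a
direction into four taps by also applying each offset negated. A scalar sketch
of the addressing; the offsets are passed in rather than read from the real
table, and src must point at the center pixel of a padded buffer:

    #include <cstddef>
    #include <cstdint>

    // kCdefDirections is indexed as [direction][tap][0 = y, 1 = x] in the
    // code above; for direction 0 the four taps lie on a line from top right
    // to bottom left through the current pixel.
    inline void LoadDirectionModel(const uint16_t* src, ptrdiff_t stride,
                                   const int offsets[2][2],
                                   uint16_t taps[4]) {
      const int y_0 = offsets[0][0], x_0 = offsets[0][1];
      const int y_1 = offsets[1][0], x_1 = offsets[1][1];
      taps[0] = src[y_0 * stride + x_0];   // first tap
      taps[1] = src[-y_0 * stride - x_0];  // first tap, offset negated
      taps[2] = src[y_1 * stride + x_1];   // second tap
      taps[3] = src[-y_1 * stride - x_1];  // second tap, offset negated
    }
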
diff --git a/src/dsp/x86/cdef_sse4.cc b/src/dsp/x86/cdef_sse4.cc
index 6ede778..6c48844 100644
--- a/src/dsp/x86/cdef_sse4.cc
+++ b/src/dsp/x86/cdef_sse4.cc
@@ -241,8 +241,8 @@ LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(__m128i* v_src, __m128i* partial_lo,
*partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[3], 10));
}
-LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* src, ptrdiff_t stride,
- __m128i* partial_lo,
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* LIBGAV1_RESTRICT src,
+ ptrdiff_t stride, __m128i* partial_lo,
__m128i* partial_hi) {
// 8x8 input
// 00 01 02 03 04 05 06 07
@@ -395,8 +395,10 @@ inline uint32_t SquareSum_S16(const __m128i a) {
return SumVector_S32(square);
}
-void CdefDirection_SSE4_1(const void* const source, ptrdiff_t stride,
- uint8_t* const direction, int* const variance) {
+void CdefDirection_SSE4_1(const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride,
+ uint8_t* LIBGAV1_RESTRICT const direction,
+ int* LIBGAV1_RESTRICT const variance) {
assert(direction != nullptr);
assert(variance != nullptr);
const auto* src = static_cast<const uint8_t*>(source);
@@ -438,8 +440,9 @@ void CdefDirection_SSE4_1(const void* const source, ptrdiff_t stride,
// CdefFilter
// Load 4 vectors based on the given |direction|.
-inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
- __m128i* output, const int direction) {
+inline void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
+ const ptrdiff_t stride, __m128i* output,
+ const int direction) {
// Each |direction| describes a different set of source values. Expand this
// set by negating each set. For |direction| == 0 this gives a diagonal line
// from top right to bottom left. The first value is y, the second x. Negative
@@ -463,8 +466,9 @@ inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
// do 2 rows at a time.
-void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
- __m128i* output, const int direction) {
+void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
+ const ptrdiff_t stride, __m128i* output,
+ const int direction) {
const int y_0 = kCdefDirections[direction][0][0];
const int x_0 = kCdefDirections[direction][0][1];
const int y_1 = kCdefDirections[direction][1][0];
@@ -507,10 +511,11 @@ inline __m128i ApplyConstrainAndTap(const __m128i& pixel, const __m128i& val,
}
template <int width, bool enable_primary = true, bool enable_secondary = true>
-void CdefFilter_SSE4_1(const uint16_t* src, const ptrdiff_t src_stride,
- const int height, const int primary_strength,
- const int secondary_strength, const int damping,
- const int direction, void* dest,
+void CdefFilter_SSE4_1(const uint16_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride, const int height,
+ const int primary_strength, const int secondary_strength,
+ const int damping, const int direction,
+ void* LIBGAV1_RESTRICT dest,
const ptrdiff_t dst_stride) {
static_assert(width == 8 || width == 4, "Invalid CDEF width.");
static_assert(enable_primary || enable_secondary, "");
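
ApplyConstrainAndTap, shared by the AVX2 and SSE4 filters, clamps each tap's
difference from the center pixel before it is weighted. A scalar sketch
following the AV1 CDEF constrain() definition (the formula comes from the
spec, not from these hunks):

    #include <algorithm>
    #include <cstdlib>

    inline int FloorLog2(int n) {  // local helper; n > 0
      int log = 0;
      while ((n >>= 1) != 0) ++log;
      return log;
    }

    // Large differences are progressively suppressed: the damping shift
    // shrinks as the strength grows, and the result never exceeds |diff|.
    inline int Constrain(int diff, int strength, int damping) {
      if (strength == 0) return 0;
      const int shift = std::max(0, damping - FloorLog2(strength));
      const int magnitude =
          std::min(std::abs(diff),
                   std::max(0, strength - (std::abs(diff) >> shift)));
      return (diff < 0) ? -magnitude : magnitude;
    }
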
diff --git a/src/dsp/x86/common_avx2_test.cc b/src/dsp/x86/common_avx2_test.cc
new file mode 100644
index 0000000..2062683
--- /dev/null
+++ b/src/dsp/x86/common_avx2_test.cc
@@ -0,0 +1,67 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/x86/common_avx2.h"
+
+#include "gtest/gtest.h"
+
+#if LIBGAV1_TARGETING_AVX2
+
+#include <cstdint>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Show that RightShiftWithRounding_S16() is equal to
+// RightShiftWithRounding() only for values less than or equal to
+// INT16_MAX - ((1 << bits) >> 1). In particular, if bits == 16, then
+// RightShiftWithRounding_S16() is equal to RightShiftWithRounding() only for
+// negative values.
+TEST(CommonDspTest, AVX2RightShiftWithRoundingS16) {
+ for (int bits = 0; bits < 16; ++bits) {
+ const int bias = (1 << bits) >> 1;
+ for (int32_t value = INT16_MIN; value <= INT16_MAX; ++value) {
+ const __m256i v_val_d = _mm256_set1_epi16(value);
+ const __m256i v_result_d = RightShiftWithRounding_S16(v_val_d, bits);
+ // Note _mm256_extract_epi16 is avoided for compatibility with Visual
+ // Studio < 2017.
+ const int16_t result =
+ _mm_extract_epi16(_mm256_extracti128_si256(v_result_d, 0), 0);
+ const int32_t expected = RightShiftWithRounding(value, bits);
+ if (value <= INT16_MAX - bias) {
+ EXPECT_EQ(result, expected) << "value: " << value << ", bits: " << bits;
+ } else {
+ EXPECT_EQ(expected, 1 << (15 - bits));
+ EXPECT_EQ(result, -expected)
+ << "value: " << value << ", bits: " << bits;
+ }
+ }
+ }
+}
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_AVX2
+
+TEST(CommonDspTest, AVX2) {
+ GTEST_SKIP() << "Build this module for x86(-64) with AVX2 enabled to enable "
+ "the tests.";
+}
+
+#endif // LIBGAV1_TARGETING_AVX2
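
A worked instance of the boundary the test comment describes, using a scalar
model in which the rounding bias is added in wrapping 16-bit arithmetic like
the packed vector add:

    #include <cstdint>

    // Model: add ((1 << bits) >> 1) in int16_t, then shift arithmetically.
    // int16_t addition wraps modulo 2^16, mirroring the vector lanes.
    inline int16_t RoundShift16Model(int16_t value, int bits) {
      const auto biased = static_cast<int16_t>(value + ((1 << bits) >> 1));
      return static_cast<int16_t>(biased >> bits);
    }

    // At bits == 4 (bias == 8), value == 32767: 32767 + 8 == 32775 wraps to
    // -32761, and -32761 >> 4 == -2048. The wide-integer reference gives
    // (32767 + 8) >> 4 == 2048 == 1 << (15 - 4), so the vector result is
    // exactly -expected, which is what the loop above asserts.
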
diff --git a/src/dsp/x86/common_sse4_test.cc b/src/dsp/x86/common_sse4_test.cc
new file mode 100644
index 0000000..4ea811a
--- /dev/null
+++ b/src/dsp/x86/common_sse4_test.cc
@@ -0,0 +1,64 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/x86/common_sse4.h"
+
+#include "gtest/gtest.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <cstdint>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Show that RightShiftWithRounding_S16() is equal to
+// RightShiftWithRounding() only for values less than or equal to
+// INT16_MAX - ((1 << bits) >> 1). In particular, if bits == 16, then
+// RightShiftWithRounding_S16() is equal to RightShiftWithRounding() only for
+// negative values.
+TEST(CommonDspTest, SSE4RightShiftWithRoundingS16) {
+ for (int bits = 0; bits < 16; ++bits) {
+ const int bias = (1 << bits) >> 1;
+ for (int32_t value = INT16_MIN; value <= INT16_MAX; ++value) {
+ const __m128i v_val_d = _mm_set1_epi16(value);
+ const __m128i v_result_d = RightShiftWithRounding_S16(v_val_d, bits);
+ const int16_t result = _mm_extract_epi16(v_result_d, 0);
+ const int32_t expected = RightShiftWithRounding(value, bits);
+ if (value <= INT16_MAX - bias) {
+ EXPECT_EQ(result, expected) << "value: " << value << ", bits: " << bits;
+ } else {
+ EXPECT_EQ(expected, 1 << (15 - bits));
+ EXPECT_EQ(result, -expected)
+ << "value: " << value << ", bits: " << bits;
+ }
+ }
+ }
+}
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+TEST(CommonDspTest, SSE4) {
+ GTEST_SKIP() << "Build this module for x86(-64) with SSE4 enabled to enable "
+ "the tests.";
+}
+
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/convolve_avx2.cc b/src/dsp/x86/convolve_avx2.cc
index 2ecb77c..4126ca9 100644
--- a/src/dsp/x86/convolve_avx2.cc
+++ b/src/dsp/x86/convolve_avx2.cc
@@ -127,10 +127,11 @@ __m256i HorizontalTaps8To16(const __m256i* const src,
// Filter 2xh sizes.
template <int num_taps, int filter_index, bool is_2d = false,
bool is_compound = false>
-void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dest, const ptrdiff_t pred_stride,
- const int /*width*/, const int height,
- const __m128i* const v_tap) {
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t pred_stride, const int /*width*/,
+ const int height, const __m128i* const v_tap) {
auto* dest8 = static_cast<uint8_t*>(dest);
auto* dest16 = static_cast<uint16_t*>(dest);
@@ -195,10 +196,11 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
// Filter widths >= 4.
template <int num_taps, int filter_index, bool is_2d = false,
bool is_compound = false>
-void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dest, const ptrdiff_t pred_stride,
- const int width, const int height,
- const __m256i* const v_tap) {
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t pred_stride, const int width,
+ const int height, const __m256i* const v_tap) {
auto* dest8 = static_cast<uint8_t*>(dest);
auto* dest16 = static_cast<uint16_t*>(dest);
@@ -467,7 +469,8 @@ __m256i SimpleSum2DVerticalTaps(const __m256i* const src,
}
template <int num_taps, bool is_compound = false>
-void Filter2DVertical16xH(const uint16_t* src, void* const dst,
+void Filter2DVertical16xH(const uint16_t* LIBGAV1_RESTRICT src,
+ void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int width,
const int height, const __m256i* const taps) {
assert(width >= 8);
@@ -542,9 +545,10 @@ void Filter2DVertical16xH(const uint16_t* src, void* const dst,
template <bool is_2d = false, bool is_compound = false>
LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
- const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
- const ptrdiff_t dst_stride, const int width, const int height,
- const int filter_id, const int filter_index) {
+ const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+ const int width, const int height, const int filter_id,
+ const int filter_index) {
assert(filter_id != 0);
__m128i v_tap[4];
const __m128i v_horizontal_filter =
@@ -567,9 +571,10 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
template <bool is_2d = false, bool is_compound = false>
LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
- const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
- const ptrdiff_t dst_stride, const int width, const int height,
- const int filter_id, const int filter_index) {
+ const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+ const int width, const int height, const int filter_id,
+ const int filter_index) {
assert(filter_id != 0);
__m256i v_tap[4];
const __m128i v_horizontal_filter =
@@ -602,13 +607,13 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
}
}
-void Convolve2D_AVX2(const void* const reference,
+void Convolve2D_AVX2(const void* LIBGAV1_RESTRICT const reference,
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
const int vertical_filter_index,
const int horizontal_filter_id,
const int vertical_filter_id, const int width,
- const int height, void* prediction,
+ const int height, void* LIBGAV1_RESTRICT prediction,
const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
@@ -774,10 +779,11 @@ __m256i SumVerticalTaps(const __m256i* const srcs, const __m256i* const v_tap) {
}
template <int filter_index, bool is_compound = false>
-void FilterVertical32xH(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dst, const ptrdiff_t dst_stride,
- const int width, const int height,
- const __m256i* const v_tap) {
+void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const __m256i* const v_tap) {
const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
auto* dst8 = static_cast<uint8_t*>(dst);
@@ -856,10 +862,11 @@ void FilterVertical32xH(const uint8_t* src, const ptrdiff_t src_stride,
}
template <int filter_index, bool is_compound = false>
-void FilterVertical16xH(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dst, const ptrdiff_t dst_stride,
- const int /*width*/, const int height,
- const __m256i* const v_tap) {
+void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int /*width*/,
+ const int height, const __m256i* const v_tap) {
const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps;
auto* dst8 = static_cast<uint8_t*>(dst);
@@ -958,10 +965,11 @@ void FilterVertical16xH(const uint8_t* src, const ptrdiff_t src_stride,
}
template <int filter_index, bool is_compound = false>
-void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dst, const ptrdiff_t dst_stride,
- const int /*width*/, const int height,
- const __m256i* const v_tap) {
+void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int /*width*/,
+ const int height, const __m256i* const v_tap) {
const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps;
auto* dst8 = static_cast<uint8_t*>(dst);
@@ -1055,10 +1063,11 @@ void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride,
}
template <int filter_index, bool is_compound = false>
-void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dst, const ptrdiff_t dst_stride,
- const int /*width*/, const int height,
- const __m128i* const v_tap) {
+void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int /*width*/,
+ const int height, const __m128i* const v_tap) {
const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
auto* dst8 = static_cast<uint8_t*>(dst);
@@ -1119,13 +1128,13 @@ void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride,
} while (--y != 0);
}
-void ConvolveVertical_AVX2(const void* const reference,
+void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/,
const int vertical_filter_index,
const int /*horizontal_filter_id*/,
const int vertical_filter_id, const int width,
- const int height, void* prediction,
+ const int height, void* LIBGAV1_RESTRICT prediction,
const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(filter_index);
@@ -1257,11 +1266,11 @@ void ConvolveVertical_AVX2(const void* const reference,
}
void ConvolveCompoundVertical_AVX2(
- const void* const reference, const ptrdiff_t reference_stride,
- const int /*horizontal_filter_index*/, const int vertical_filter_index,
- const int /*horizontal_filter_id*/, const int vertical_filter_id,
- const int width, const int height, void* prediction,
- const ptrdiff_t /*pred_stride*/) {
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int vertical_filter_index, const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(filter_index);
const ptrdiff_t src_stride = reference_stride;
@@ -1366,14 +1375,12 @@ void ConvolveCompoundVertical_AVX2(
}
}
-void ConvolveHorizontal_AVX2(const void* const reference,
- const ptrdiff_t reference_stride,
- const int horizontal_filter_index,
- const int /*vertical_filter_index*/,
- const int horizontal_filter_id,
- const int /*vertical_filter_id*/, const int width,
- const int height, void* prediction,
- const ptrdiff_t pred_stride) {
+void ConvolveHorizontal_AVX2(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int /*vertical_filter_index*/, const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
// Set |src| to the outermost tap.
const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
@@ -1390,11 +1397,11 @@ void ConvolveHorizontal_AVX2(const void* const reference,
}
void ConvolveCompoundHorizontal_AVX2(
- const void* const reference, const ptrdiff_t reference_stride,
- const int horizontal_filter_index, const int /*vertical_filter_index*/,
- const int horizontal_filter_id, const int /*vertical_filter_id*/,
- const int width, const int height, void* prediction,
- const ptrdiff_t pred_stride) {
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int /*vertical_filter_index*/, const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
// Set |src| to the outermost tap.
const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
@@ -1415,14 +1422,12 @@ void ConvolveCompoundHorizontal_AVX2(
filter_index);
}
-void ConvolveCompound2D_AVX2(const void* const reference,
- const ptrdiff_t reference_stride,
- const int horizontal_filter_index,
- const int vertical_filter_index,
- const int horizontal_filter_id,
- const int vertical_filter_id, const int width,
- const int height, void* prediction,
- const ptrdiff_t pred_stride) {
+void ConvolveCompound2D_AVX2(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int vertical_filter_index, const int horizontal_filter_id,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
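
Per output pixel, DoHorizontalPass and the FilterHorizontal templates reduce
to a short inner product followed by a rounding shift. A scalar sketch that
assumes the halved filters (kHalfSubPixelFilters) summing to 64, hence the
shift of 6; the is_2d and is_compound paths keep wider intermediates instead
of packing to 8 bits:

    #include <cstdint>

    // One pixel of the non-compound horizontal pass: num_taps halved
    // coefficients against consecutive source pixels, then
    // RightShiftWithRounding(sum, 6) and a clamp to 8 bits. The shift amount
    // is an assumption tied to the halved-filter convention.
    inline uint8_t FilterHorizontalModel(const uint8_t* src,
                                         const int8_t* taps,
                                         const int num_taps) {
      int sum = 0;
      for (int k = 0; k < num_taps; ++k) sum += taps[k] * src[k];
      const int value = (sum + 32) >> 6;
      return static_cast<uint8_t>(value < 0 ? 0 : (value > 255 ? 255 : value));
    }
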
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc
index 9b72fe4..f7e5a71 100644
--- a/src/dsp/x86/convolve_sse4.cc
+++ b/src/dsp/x86/convolve_sse4.cc
@@ -37,7 +37,7 @@ namespace {
#include "src/dsp/x86/convolve_sse4.inc"
template <int filter_index>
-__m128i SumHorizontalTaps(const uint8_t* const src,
+__m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i* const v_tap) {
__m128i v_src[4];
const __m128i src_long = LoadUnaligned16(src);
@@ -68,7 +68,7 @@ __m128i SumHorizontalTaps(const uint8_t* const src,
}
template <int filter_index>
-__m128i SimpleHorizontalTaps(const uint8_t* const src,
+__m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i* const v_tap) {
__m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
@@ -84,7 +84,7 @@ __m128i SimpleHorizontalTaps(const uint8_t* const src,
}
template <int filter_index>
-__m128i HorizontalTaps8To16(const uint8_t* const src,
+__m128i HorizontalTaps8To16(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i* const v_tap) {
const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
@@ -93,10 +93,11 @@ __m128i HorizontalTaps8To16(const uint8_t* const src,
template <int num_taps, int filter_index, bool is_2d = false,
bool is_compound = false>
-void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dest, const ptrdiff_t pred_stride,
- const int width, const int height,
- const __m128i* const v_tap) {
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t pred_stride, const int width,
+ const int height, const __m128i* const v_tap) {
auto* dest8 = static_cast<uint8_t*>(dest);
auto* dest16 = static_cast<uint16_t*>(dest);
@@ -206,9 +207,10 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
template <bool is_2d = false, bool is_compound = false>
LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
- const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
- const ptrdiff_t dst_stride, const int width, const int height,
- const int filter_id, const int filter_index) {
+ const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+ const int width, const int height, const int filter_id,
+ const int filter_index) {
assert(filter_id != 0);
__m128i v_tap[4];
const __m128i v_horizontal_filter =
@@ -241,13 +243,13 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
}
}
-void Convolve2D_SSE4_1(const void* const reference,
+void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
const int vertical_filter_index,
const int horizontal_filter_id,
const int vertical_filter_id, const int width,
- const int height, void* prediction,
+ const int height, void* LIBGAV1_RESTRICT prediction,
const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
@@ -328,10 +330,11 @@ void Convolve2D_SSE4_1(const void* const reference,
}
template <int filter_index, bool is_compound = false>
-void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dst, const ptrdiff_t dst_stride,
- const int width, const int height,
- const __m128i* const v_tap) {
+void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const __m128i* const v_tap) {
const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
auto* dst8 = static_cast<uint8_t*>(dst);
@@ -400,14 +403,12 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
} while (x < width);
}
-void ConvolveVertical_SSE4_1(const void* const reference,
- const ptrdiff_t reference_stride,
- const int /*horizontal_filter_index*/,
- const int vertical_filter_index,
- const int /*horizontal_filter_id*/,
- const int vertical_filter_id, const int width,
- const int height, void* prediction,
- const ptrdiff_t pred_stride) {
+void ConvolveVertical_SSE4_1(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int vertical_filter_index, const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(filter_index);
const ptrdiff_t src_stride = reference_stride;
@@ -477,14 +478,12 @@ void ConvolveVertical_SSE4_1(const void* const reference,
}
}
-void ConvolveCompoundCopy_SSE4(const void* const reference,
- const ptrdiff_t reference_stride,
- const int /*horizontal_filter_index*/,
- const int /*vertical_filter_index*/,
- const int /*horizontal_filter_id*/,
- const int /*vertical_filter_id*/,
- const int width, const int height,
- void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveCompoundCopy_SSE4(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
const auto* src = static_cast<const uint8_t*>(reference);
const ptrdiff_t src_stride = reference_stride;
auto* dest = static_cast<uint16_t*>(prediction);
@@ -539,11 +538,11 @@ void ConvolveCompoundCopy_SSE4(const void* const reference,
}
void ConvolveCompoundVertical_SSE4_1(
- const void* const reference, const ptrdiff_t reference_stride,
- const int /*horizontal_filter_index*/, const int vertical_filter_index,
- const int /*horizontal_filter_id*/, const int vertical_filter_id,
- const int width, const int height, void* prediction,
- const ptrdiff_t /*pred_stride*/) {
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int vertical_filter_index, const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(filter_index);
const ptrdiff_t src_stride = reference_stride;
@@ -608,14 +607,12 @@ void ConvolveCompoundVertical_SSE4_1(
}
}
-void ConvolveHorizontal_SSE4_1(const void* const reference,
- const ptrdiff_t reference_stride,
- const int horizontal_filter_index,
- const int /*vertical_filter_index*/,
- const int horizontal_filter_id,
- const int /*vertical_filter_id*/,
- const int width, const int height,
- void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveHorizontal_SSE4_1(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int /*vertical_filter_index*/, const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
// Set |src| to the outermost tap.
const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
@@ -626,11 +623,11 @@ void ConvolveHorizontal_SSE4_1(const void* const reference,
}
void ConvolveCompoundHorizontal_SSE4_1(
- const void* const reference, const ptrdiff_t reference_stride,
- const int horizontal_filter_index, const int /*vertical_filter_index*/,
- const int horizontal_filter_id, const int /*vertical_filter_id*/,
- const int width, const int height, void* prediction,
- const ptrdiff_t /*pred_stride*/) {
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int /*vertical_filter_index*/, const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
auto* dest = static_cast<uint16_t*>(prediction);
@@ -640,14 +637,12 @@ void ConvolveCompoundHorizontal_SSE4_1(
filter_index);
}
-void ConvolveCompound2D_SSE4_1(const void* const reference,
- const ptrdiff_t reference_stride,
- const int horizontal_filter_index,
- const int vertical_filter_index,
- const int horizontal_filter_id,
- const int vertical_filter_id, const int width,
- const int height, void* prediction,
- const ptrdiff_t /*pred_stride*/) {
+void ConvolveCompound2D_SSE4_1(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int vertical_filter_index, const int horizontal_filter_id,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
// The output of the horizontal filter, i.e. the intermediate_result, is
// guaranteed to fit in int16_t.
alignas(16) uint16_t
@@ -835,7 +830,8 @@ inline void GetHalfSubPixelFilter(__m128i* output) {
// exceed 4 when width <= 4, |grade_x| is set to 1 regardless of the value of
// |step_x|.
template <int num_taps, int grade_x>
-inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices,
+inline void PrepareSourceVectors(const uint8_t* LIBGAV1_RESTRICT src,
+ const __m128i src_indices,
__m128i* const source /*[num_taps >> 1]*/) {
// |used_bytes| is only computed in msan builds. Mask away unused bytes for
// msan because it incorrectly models the outcome of the shuffles in some
@@ -900,10 +896,11 @@ inline __m128i HorizontalScaleIndices(const __m128i subpel_indices) {
}
template <int grade_x, int filter_index, int num_taps>
-inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride,
- int width, int subpixel_x, int step_x,
+inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
+ ptrdiff_t src_stride, int width,
+ int subpixel_x, int step_x,
int intermediate_height,
- int16_t* intermediate) {
+ int16_t* LIBGAV1_RESTRICT intermediate) {
// Account for the 0-taps that precede the 2 nonzero taps.
const int kernel_offset = (8 - num_taps) >> 1;
const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -946,11 +943,11 @@ inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride,
}
// |width| >= 8
+ int16_t* intermediate_x = intermediate;
int x = 0;
do {
const uint8_t* src_x =
&src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
- int16_t* intermediate_x = intermediate + x;
// Only add steps to the 10-bit truncated p to avoid overflow.
const __m128i p_fraction = _mm_set1_epi16(p & 1023);
const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
@@ -976,7 +973,8 @@ inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride,
}
template <int num_taps>
-inline void PrepareVerticalTaps(const int8_t* taps, __m128i* output) {
+inline void PrepareVerticalTaps(const int8_t* LIBGAV1_RESTRICT taps,
+ __m128i* output) {
// Avoid overreading the filter due to starting at kernel_offset.
// The only danger of overread is in the final filter, which has 4 taps.
const __m128i filter =
@@ -1072,10 +1070,12 @@ __m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo,
// |width_class| is 2, 4, or 8, according to the Store function that should be
// used.
template <int num_taps, int width_class, bool is_compound>
-inline void ConvolveVerticalScale(const int16_t* src, const int width,
- const int subpixel_y, const int filter_index,
- const int step_y, const int height,
- void* dest, const ptrdiff_t dest_stride) {
+inline void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT src,
+ const int intermediate_height,
+ const int width, const int subpixel_y,
+ const int filter_index, const int step_y,
+ const int height, void* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dest_stride) {
constexpr ptrdiff_t src_stride = kIntermediateStride;
constexpr int kernel_offset = (8 - num_taps) / 2;
const int16_t* src_y = src;
@@ -1138,15 +1138,19 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width,
// |width_class| >= 8
__m128i filter_taps[num_taps >> 1];
- do { // y > 0
- src_y = src + (p >> kScaleSubPixelBits) * src_stride;
- const int filter_id = (p >> 6) & kSubPixelMask;
- const int8_t* filter =
- kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
- PrepareVerticalTaps<num_taps>(filter, filter_taps);
-
- int x = 0;
- do { // x < width
+ int x = 0;
+ do { // x < width
+ auto* dest_y = static_cast<uint8_t*>(dest) + x;
+ auto* dest16_y = static_cast<uint16_t*>(dest) + x;
+ int p = subpixel_y & 1023;
+ int y = height;
+ do { // y > 0
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ const int8_t* filter =
+ kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+ PrepareVerticalTaps<num_taps>(filter, filter_taps);
+
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
for (int i = 0; i < num_taps; ++i) {
s[i] = LoadUnaligned16(src_y + i * src_stride);
}
@@ -1154,38 +1158,36 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width,
const __m128i sums =
Sum2DVerticalTaps<num_taps, is_compound>(s, filter_taps);
if (is_compound) {
- StoreUnaligned16(dest16_y + x, sums);
+ StoreUnaligned16(dest16_y, sums);
} else {
- StoreLo8(dest_y + x, _mm_packus_epi16(sums, sums));
+ StoreLo8(dest_y, _mm_packus_epi16(sums, sums));
}
- x += 8;
- src_y += 8;
- } while (x < width);
- p += step_y;
- dest_y += dest_stride;
- dest16_y += dest_stride;
- } while (--y != 0);
+ p += step_y;
+ dest_y += dest_stride;
+ dest16_y += dest_stride;
+ } while (--y != 0);
+ src += kIntermediateStride * intermediate_height;
+ x += 8;
+ } while (x < width);
}
template <bool is_compound>
-void ConvolveScale2D_SSE4_1(const void* const reference,
+void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
const int vertical_filter_index,
const int subpixel_x, const int subpixel_y,
const int step_x, const int step_y, const int width,
- const int height, void* prediction,
+ const int height, void* LIBGAV1_RESTRICT prediction,
const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
assert(step_x <= 2048);
// The output of the horizontal filter, i.e. the intermediate_result, is
// guaranteed to fit in int16_t.
- // TODO(petersonab): Reduce intermediate block stride to width to make smaller
- // blocks faster.
alignas(16) int16_t
- intermediate_result[kMaxSuperBlockSizeInPixels *
- (2 * kMaxSuperBlockSizeInPixels + kSubPixelTaps)];
+ intermediate_result[kIntermediateAllocWidth *
+ (2 * kIntermediateAllocWidth + kSubPixelTaps)];
const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
const int intermediate_height =
(((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
@@ -1282,76 +1284,78 @@ void ConvolveScale2D_SSE4_1(const void* const reference,
case 1:
if (!is_compound && width == 2) {
ConvolveVerticalScale<6, 2, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
} else if (width == 4) {
ConvolveVerticalScale<6, 4, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
} else {
ConvolveVerticalScale<6, 8, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
}
break;
case 2:
if (!is_compound && width == 2) {
ConvolveVerticalScale<8, 2, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
} else if (width == 4) {
ConvolveVerticalScale<8, 4, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
} else {
ConvolveVerticalScale<8, 8, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
}
break;
case 3:
if (!is_compound && width == 2) {
ConvolveVerticalScale<2, 2, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
} else if (width == 4) {
ConvolveVerticalScale<2, 4, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
} else {
ConvolveVerticalScale<2, 8, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
}
break;
default:
assert(vert_filter_index == 4 || vert_filter_index == 5);
if (!is_compound && width == 2) {
ConvolveVerticalScale<4, 2, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
} else if (width == 4) {
ConvolveVerticalScale<4, 4, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
} else {
ConvolveVerticalScale<4, 8, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
}
}
}
-inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+inline void HalfAddHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+ uint8_t* LIBGAV1_RESTRICT dst) {
const __m128i left = LoadUnaligned16(src);
const __m128i right = LoadUnaligned16(src + 1);
StoreUnaligned16(dst, _mm_avg_epu8(left, right));
}
template <int width>
-inline void IntraBlockCopyHorizontal(const uint8_t* src,
+inline void IntraBlockCopyHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
- const int height, uint8_t* dst,
+ const int height,
+ uint8_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t dst_stride) {
const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
@@ -1392,10 +1396,11 @@ inline void IntraBlockCopyHorizontal(const uint8_t* src,
}
void ConvolveIntraBlockCopyHorizontal_SSE4_1(
- const void* const reference, const ptrdiff_t reference_stride,
- const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
- const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
- const int height, void* const prediction, const ptrdiff_t pred_stride) {
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*subpixel_x*/,
+ const int /*subpixel_y*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
const auto* src = static_cast<const uint8_t*>(reference);
auto* dest = static_cast<uint8_t*>(prediction);
@@ -1464,9 +1469,10 @@ void ConvolveIntraBlockCopyHorizontal_SSE4_1(
}
template <int width>
-inline void IntraBlockCopyVertical(const uint8_t* src,
+inline void IntraBlockCopyVertical(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride, const int height,
- uint8_t* dst, const ptrdiff_t dst_stride) {
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
__m128i row[8], below[8];
@@ -1553,11 +1559,11 @@ inline void IntraBlockCopyVertical(const uint8_t* src,
}
void ConvolveIntraBlockCopyVertical_SSE4_1(
- const void* const reference, const ptrdiff_t reference_stride,
- const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
- const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
- const int width, const int height, void* const prediction,
- const ptrdiff_t pred_stride) {
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
const auto* src = static_cast<const uint8_t*>(reference);
auto* dest = static_cast<uint8_t*>(prediction);
@@ -1622,7 +1628,8 @@ void ConvolveIntraBlockCopyVertical_SSE4_1(
}
// Load then add two uint8_t vectors. Return the uint16_t vector result.
-inline __m128i LoadU8AndAddLong(const uint8_t* src, const uint8_t* src1) {
+inline __m128i LoadU8AndAddLong(const uint8_t* LIBGAV1_RESTRICT src,
+ const uint8_t* LIBGAV1_RESTRICT src1) {
const __m128i a = _mm_cvtepu8_epi16(LoadLo8(src));
const __m128i b = _mm_cvtepu8_epi16(LoadLo8(src1));
return _mm_add_epi16(a, b);
@@ -1637,8 +1644,9 @@ inline __m128i AddU16RightShift2AndPack(__m128i v0, __m128i v1) {
}
template <int width>
-inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
- const int height, uint8_t* dst,
+inline void IntraBlockCopy2D(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride, const int height,
+ uint8_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t dst_stride) {
const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
@@ -1793,11 +1801,11 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
}
void ConvolveIntraBlockCopy2D_SSE4_1(
- const void* const reference, const ptrdiff_t reference_stride,
- const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
- const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
- const int width, const int height, void* const prediction,
- const ptrdiff_t pred_stride) {
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
const auto* src = static_cast<const uint8_t*>(reference);
auto* dest = static_cast<uint8_t*>(prediction);
// Note: allow vertical access to height + 1. Because this function is only
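
The ConvolveVerticalScale rewrite above swaps the loop nest: the column loop
moves outside, each 8-wide strip re-derives the sub-pixel phase p and walks
the rows, and the intermediate pointer advances by a whole strip at the end.
A scalar model of the traversal, with the tap math reduced to plain loops;
the 10-bit phase and 4-bit filter id follow the constants visible in the
diff, while the tap table and final rounding shift are assumptions:

    #include <cstddef>
    #include <cstdint>

    void VerticalScaleModel(const int16_t* src, ptrdiff_t src_stride,
                            int intermediate_height, int width,
                            int subpixel_y, int step_y, int height,
                            int num_taps, const int8_t taps[16][8],
                            uint8_t* dest, ptrdiff_t dest_stride) {
      for (int x = 0; x < width; x += 8) {
        int p = subpixel_y & 1023;  // sub-pixel phase, 10 fractional bits
        for (int y = 0; y < height; ++y) {
          const int filter_id = (p >> 6) & 15;  // top 4 fraction bits
          const int16_t* src_y = src + (p >> 10) * src_stride;
          for (int lane = 0; lane < 8; ++lane) {
            int sum = 0;
            for (int k = 0; k < num_taps; ++k) {
              sum += taps[filter_id][k] * src_y[k * src_stride + lane];
            }
            const int value = (sum + 32) >> 6;  // rounding shift: assumption
            dest[y * dest_stride + x + lane] = static_cast<uint8_t>(
                value < 0 ? 0 : (value > 255 ? 255 : value));
          }
          p += step_y;  // next output row's source position, in Q10
        }
        src += src_stride * intermediate_height;  // next 8-wide column strip
      }
    }
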
diff --git a/src/dsp/x86/distance_weighted_blend_sse4.cc b/src/dsp/x86/distance_weighted_blend_sse4.cc
index 3c29b19..c813df4 100644
--- a/src/dsp/x86/distance_weighted_blend_sse4.cc
+++ b/src/dsp/x86/distance_weighted_blend_sse4.cc
@@ -54,8 +54,10 @@ inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
template <int height>
inline void DistanceWeightedBlend4xH_SSE4_1(
- const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
- const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+ const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
@@ -98,8 +100,10 @@ inline void DistanceWeightedBlend4xH_SSE4_1(
template <int height>
inline void DistanceWeightedBlend8xH_SSE4_1(
- const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
- const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+ const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
@@ -125,9 +129,10 @@ inline void DistanceWeightedBlend8xH_SSE4_1(
}
inline void DistanceWeightedBlendLarge_SSE4_1(
- const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
- const uint8_t weight_1, const int width, const int height, void* const dest,
- const ptrdiff_t dest_stride) {
+ const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, const int width, const int height,
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
@@ -154,11 +159,12 @@ inline void DistanceWeightedBlendLarge_SSE4_1(
} while (--y != 0);
}
-void DistanceWeightedBlend_SSE4_1(const void* prediction_0,
- const void* prediction_1,
+void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
const uint8_t weight_0,
const uint8_t weight_1, const int width,
- const int height, void* const dest,
+ const int height,
+ void* LIBGAV1_RESTRICT const dest,
const ptrdiff_t dest_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
@@ -257,8 +263,10 @@ inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
template <int height>
inline void DistanceWeightedBlend4xH_SSE4_1(
- const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0,
- const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+ const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint16_t*>(dest);
const __m128i weight0 = _mm_set1_epi32(weight_0);
const __m128i weight1 = _mm_set1_epi32(weight_1);
@@ -301,8 +309,10 @@ inline void DistanceWeightedBlend4xH_SSE4_1(
template <int height>
inline void DistanceWeightedBlend8xH_SSE4_1(
- const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0,
- const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+ const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint16_t*>(dest);
const __m128i weight0 = _mm_set1_epi32(weight_0);
const __m128i weight1 = _mm_set1_epi32(weight_1);
@@ -332,9 +342,10 @@ inline void DistanceWeightedBlend8xH_SSE4_1(
}
inline void DistanceWeightedBlendLarge_SSE4_1(
- const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0,
- const uint8_t weight_1, const int width, const int height, void* const dest,
- const ptrdiff_t dest_stride) {
+ const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, const int width, const int height,
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint16_t*>(dest);
const __m128i weight0 = _mm_set1_epi32(weight_0);
const __m128i weight1 = _mm_set1_epi32(weight_1);
@@ -364,11 +375,12 @@ inline void DistanceWeightedBlendLarge_SSE4_1(
} while (--y != 0);
}
-void DistanceWeightedBlend_SSE4_1(const void* prediction_0,
- const void* prediction_1,
+void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
const uint8_t weight_0,
const uint8_t weight_1, const int width,
- const int height, void* const dest,
+ const int height,
+ void* LIBGAV1_RESTRICT const dest,
const ptrdiff_t dest_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
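
For orientation, a scalar model of the 8bpp blend the functions above vectorize (an editorial sketch, not patch content; it assumes weight_0 + weight_1 == 16, i.e. 4 bits of weight precision stacked on kInterPostRoundBit):

#include <algorithm>
#include <cstdint>

inline uint8_t BlendPixel(int p0, int p1, int w0, int w1) {
  constexpr int kInterPostRoundBit = 4;
  constexpr int kShift = kInterPostRoundBit + 4;  // + 4 weight bits
  const int sum = p0 * w0 + p1 * w1;
  const int rounded = (sum + (1 << (kShift - 1))) >> kShift;
  return static_cast<uint8_t>(std::clamp(rounded, 0, 255));
}

The 10bpp path in this hunk follows the same shape but widens to 32-bit lanes and clamps to the 10-bit maximum.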
diff --git a/src/dsp/x86/film_grain_sse4.cc b/src/dsp/x86/film_grain_sse4.cc
index 745c1ca..9ece947 100644
--- a/src/dsp/x86/film_grain_sse4.cc
+++ b/src/dsp/x86/film_grain_sse4.cc
@@ -126,30 +126,16 @@ inline __m128i Clip3(const __m128i value, const __m128i low,
}
template <int bitdepth, typename Pixel>
-inline __m128i GetScalingFactors(
- const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* source) {
+inline __m128i GetScalingFactors(const int16_t* scaling_lut,
+ const Pixel* source) {
alignas(16) int16_t start_vals[8];
- if (bitdepth == 8) {
- // TODO(petersonab): Speed this up by creating a uint16_t scaling_lut.
- // Currently this code results in a series of movzbl.
- for (int i = 0; i < 8; ++i) {
- start_vals[i] = scaling_lut[source[i]];
- }
- return LoadAligned16(start_vals);
- }
- alignas(16) int16_t end_vals[8];
- // TODO(petersonab): Precompute this into a larger table for direct lookups.
+ static_assert(bitdepth <= kBitdepth10,
+ "SSE4 Film Grain is not yet implemented for 12bpp.");
for (int i = 0; i < 8; ++i) {
- const int index = source[i] >> 2;
- start_vals[i] = scaling_lut[index];
- end_vals[i] = scaling_lut[index + 1];
+ assert(source[i] < kScalingLookupTableSize << (bitdepth - 2));
+ start_vals[i] = scaling_lut[source[i]];
}
- const __m128i start = LoadAligned16(start_vals);
- const __m128i end = LoadAligned16(end_vals);
- __m128i remainder = LoadSource(source);
- remainder = _mm_srli_epi16(_mm_slli_epi16(remainder, 14), 1);
- const __m128i delta = _mm_mulhrs_epi16(_mm_sub_epi16(end, start), remainder);
- return _mm_add_epi16(start, delta);
+ return LoadAligned16(start_vals);
}
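
The hunk above trades the old load-and-interpolate scheme for a single gather from an int16_t table with one entry per pixel value. A sketch of the offline expansion this presumes for 10bpp (the real table is built in the shared film grain code; the rounding and the clamp at the top edge here are assumptions):

#include <algorithm>
#include <cstdint>

void ExpandScalingLut10bpp(const uint8_t lut[256], int16_t expanded[1024]) {
  for (int pixel = 0; pixel < 1024; ++pixel) {
    const int index = pixel >> 2;  // which 8-bit segment
    const int rem = pixel & 3;     // position within the segment
    const int next = lut[std::min(index + 1, 255)];
    const int delta = next - lut[index];
    expanded[pixel] =
        static_cast<int16_t>(lut[index] + ((delta * rem + 2) >> 2));
  }
}

With the interpolation folded into the table, GetScalingFactors becomes eight loads instead of two loads plus a mulhrs-based lerp per lane.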
// |scaling_shift| is in range [8,11].
@@ -162,11 +148,10 @@ inline __m128i ScaleNoise(const __m128i noise, const __m128i scaling,
template <int bitdepth, typename GrainType, typename Pixel>
void BlendNoiseWithImageLuma_SSE4_1(
- const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift,
- int width, int height, int start_height,
- const uint8_t scaling_lut_y[kScalingLookupTableSize],
- const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y,
- ptrdiff_t dest_stride_y) {
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma,
+ int scaling_shift, int width, int height, int start_height,
+ const int16_t* scaling_lut_y, const void* source_plane_y,
+ ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y) {
const auto* noise_image =
static_cast<const Array2D<GrainType>*>(noise_image_ptr);
const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
@@ -181,7 +166,6 @@ void BlendNoiseWithImageLuma_SSE4_1(
do {
int x = 0;
for (; x < safe_width; x += 8) {
- // TODO(b/133525232): Make 16-pixel version of loop body.
const __m128i orig = LoadSource(&in_y_row[x]);
const __m128i scaling =
GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
@@ -216,9 +200,9 @@ void BlendNoiseWithImageLuma_SSE4_1(
template <int bitdepth, typename GrainType, typename Pixel>
inline __m128i BlendChromaValsWithCfl(
- const Pixel* average_luma_buffer,
- const uint8_t scaling_lut[kScalingLookupTableSize],
- const Pixel* chroma_cursor, const GrainType* noise_image_cursor,
+ const Pixel* LIBGAV1_RESTRICT average_luma_buffer,
+ const int16_t* scaling_lut, const Pixel* LIBGAV1_RESTRICT chroma_cursor,
+ const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
const __m128i scaling_shift) {
const __m128i scaling =
GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
@@ -232,11 +216,10 @@ template <int bitdepth, typename GrainType, typename Pixel>
LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
int width, int height, int start_height, int subsampling_x,
- int subsampling_y, int scaling_shift,
- const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* in_y_row,
- ptrdiff_t source_stride_y, const Pixel* in_chroma_row,
- ptrdiff_t source_stride_chroma, Pixel* out_chroma_row,
- ptrdiff_t dest_stride) {
+ int subsampling_y, int scaling_shift, const int16_t* scaling_lut,
+ const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+ const Pixel* in_chroma_row, ptrdiff_t source_stride_chroma,
+ Pixel* out_chroma_row, ptrdiff_t dest_stride) {
const __m128i floor = _mm_set1_epi16(min_value);
const __m128i ceiling = _mm_set1_epi16(max_chroma);
alignas(16) Pixel luma_buffer[16];
@@ -258,8 +241,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
int x = 0;
for (; x < safe_chroma_width; x += 8) {
const int luma_x = x << subsampling_x;
- // TODO(petersonab): Consider specializing by subsampling_x. In the 444
- // case &in_y_row[x] can be passed to GetScalingFactors directly.
const __m128i average_luma =
GetAverageLuma(&in_y_row[luma_x], subsampling_x);
StoreUnsigned(average_luma_buffer, average_luma);
@@ -277,7 +258,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
// Prevent huge indices from entering GetScalingFactors due to
 // uninitialized values. This is not a problem in 8bpp because the table
 // has more than 255 entries.
- if (bitdepth > 8) {
+ if (bitdepth > kBitdepth8) {
memset(luma_buffer, 0, sizeof(luma_buffer));
}
const int luma_x = x << subsampling_x;
@@ -306,11 +287,11 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
template <int bitdepth, typename GrainType, typename Pixel>
void BlendNoiseWithImageChromaWithCfl_SSE4_1(
- Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
- int min_value, int max_chroma, int width, int height, int start_height,
- int subsampling_x, int subsampling_y,
- const uint8_t scaling_lut[kScalingLookupTableSize],
- const void* source_plane_y, ptrdiff_t source_stride_y,
+ Plane plane, const FilmGrainParams& params,
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, const int16_t* scaling_lut,
+ const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
const void* source_plane_uv, ptrdiff_t source_stride_uv,
void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
const auto* noise_image =
@@ -335,10 +316,10 @@ namespace {
// |offset| is 32x4 packed to add with the result of _mm_madd_epi16.
inline __m128i BlendChromaValsNoCfl8bpp(
- const uint8_t scaling_lut[kScalingLookupTableSize], const __m128i& orig,
- const int8_t* noise_image_cursor, const __m128i& average_luma,
- const __m128i& scaling_shift, const __m128i& offset,
- const __m128i& weights) {
+ const int16_t* scaling_lut, const __m128i& orig,
+ const int8_t* LIBGAV1_RESTRICT noise_image_cursor,
+ const __m128i& average_luma, const __m128i& scaling_shift,
+ const __m128i& offset, const __m128i& weights) {
uint8_t merged_buffer[8];
const __m128i combined_lo =
_mm_madd_epi16(_mm_unpacklo_epi16(average_luma, orig), weights);
@@ -351,9 +332,9 @@ inline __m128i BlendChromaValsNoCfl8bpp(
StoreLo8(merged_buffer, _mm_packus_epi16(merged, merged));
const __m128i scaling =
- GetScalingFactors<8, uint8_t>(scaling_lut, merged_buffer);
+ GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
__m128i noise = LoadSource(noise_image_cursor);
- noise = ScaleNoise<8>(noise, scaling, scaling_shift);
+ noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift);
return _mm_add_epi16(orig, noise);
}
@@ -361,11 +342,10 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
const Array2D<int8_t>& noise_image, int min_value, int max_chroma,
int width, int height, int start_height, int subsampling_x,
int subsampling_y, int scaling_shift, int chroma_offset,
- int chroma_multiplier, int luma_multiplier,
- const uint8_t scaling_lut[kScalingLookupTableSize], const uint8_t* in_y_row,
- ptrdiff_t source_stride_y, const uint8_t* in_chroma_row,
- ptrdiff_t source_stride_chroma, uint8_t* out_chroma_row,
- ptrdiff_t dest_stride) {
+ int chroma_multiplier, int luma_multiplier, const int16_t* scaling_lut,
+ const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+ const uint8_t* in_chroma_row, ptrdiff_t source_stride_chroma,
+ uint8_t* out_chroma_row, ptrdiff_t dest_stride) {
const __m128i floor = _mm_set1_epi16(min_value);
const __m128i ceiling = _mm_set1_epi16(max_chroma);
@@ -432,11 +412,11 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
// This function is for the case params_.chroma_scaling_from_luma == false.
void BlendNoiseWithImageChroma8bpp_SSE4_1(
- Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
- int min_value, int max_chroma, int width, int height, int start_height,
- int subsampling_x, int subsampling_y,
- const uint8_t scaling_lut[kScalingLookupTableSize],
- const void* source_plane_y, ptrdiff_t source_stride_y,
+ Plane plane, const FilmGrainParams& params,
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, const int16_t* scaling_lut,
+ const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
const void* source_plane_uv, ptrdiff_t source_stride_uv,
void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
assert(plane == kPlaneU || plane == kPlaneV);
@@ -463,10 +443,10 @@ void Init8bpp() {
assert(dsp != nullptr);
dsp->film_grain.blend_noise_luma =
- BlendNoiseWithImageLuma_SSE4_1<8, int8_t, uint8_t>;
+ BlendNoiseWithImageLuma_SSE4_1<kBitdepth8, int8_t, uint8_t>;
dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_SSE4_1;
dsp->film_grain.blend_noise_chroma[1] =
- BlendNoiseWithImageChromaWithCfl_SSE4_1<8, int8_t, uint8_t>;
+ BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth8, int8_t, uint8_t>;
}
} // namespace
@@ -481,9 +461,9 @@ void Init10bpp() {
assert(dsp != nullptr);
dsp->film_grain.blend_noise_luma =
- BlendNoiseWithImageLuma_SSE4_1<10, int16_t, uint16_t>;
+ BlendNoiseWithImageLuma_SSE4_1<kBitdepth10, int16_t, uint16_t>;
dsp->film_grain.blend_noise_chroma[1] =
- BlendNoiseWithImageChromaWithCfl_SSE4_1<10, int16_t, uint16_t>;
+ BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth10, int16_t, uint16_t>;
}
} // namespace
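
Per pixel, the blend kernels in this file reduce to the model below, mirroring the C reference: scale the noise by a factor looked up from the pixel value, round, add, clamp (sketch). The kBitdepth8/kBitdepth10 constants that replace the literal 8/10 template arguments above are compile-time tags for the same specializations.

#include <algorithm>
#include <cstdint>

inline int BlendNoisePixel(int orig, int noise, const int16_t* scaling_lut,
                           int scaling_shift, int min_value, int max_value) {
  const int scaling = scaling_lut[orig];  // value-dependent strength
  const int scaled_noise =
      (noise * scaling + (1 << (scaling_shift - 1))) >> scaling_shift;
  return std::clamp(orig + scaled_noise, min_value, max_value);
}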
diff --git a/src/dsp/x86/intra_edge_sse4.cc b/src/dsp/x86/intra_edge_sse4.cc
index d6af907..967be06 100644
--- a/src/dsp/x86/intra_edge_sse4.cc
+++ b/src/dsp/x86/intra_edge_sse4.cc
@@ -41,7 +41,8 @@ constexpr int kMaxEdgeBufferSize = 129;
// This function applies the kernel [0, 4, 8, 4, 0] to 12 values.
// Assumes |edge| has 16 packed byte values. Produces 12 filter outputs to
 // write as overlapping sets of 8 bytes.
-inline void ComputeKernel1Store12(uint8_t* dest, const uint8_t* source) {
+inline void ComputeKernel1Store12(uint8_t* LIBGAV1_RESTRICT dest,
+ const uint8_t* LIBGAV1_RESTRICT source) {
const __m128i edge_lo = LoadUnaligned16(source);
const __m128i edge_hi = _mm_srli_si128(edge_lo, 6);
// Samples matched with the '4' tap, expanded to 16-bit.
@@ -77,7 +78,8 @@ inline void ComputeKernel1Store12(uint8_t* dest, const uint8_t* source) {
// This function applies the kernel [0, 5, 6, 5, 0] to 12 values.
// Assumes |edge| has 8 packed byte values, and that the 2 invalid values will
// be overwritten or safely discarded.
-inline void ComputeKernel2Store12(uint8_t* dest, const uint8_t* source) {
+inline void ComputeKernel2Store12(uint8_t* LIBGAV1_RESTRICT dest,
+ const uint8_t* LIBGAV1_RESTRICT source) {
const __m128i edge_lo = LoadUnaligned16(source);
const __m128i edge_hi = _mm_srli_si128(edge_lo, 6);
const __m128i outers_lo = _mm_cvtepu8_epi16(edge_lo);
@@ -115,7 +117,8 @@ inline void ComputeKernel2Store12(uint8_t* dest, const uint8_t* source) {
}
// This function applies the kernel [2, 4, 4, 4, 2] to 8 values.
-inline void ComputeKernel3Store8(uint8_t* dest, const uint8_t* source) {
+inline void ComputeKernel3Store8(uint8_t* LIBGAV1_RESTRICT dest,
+ const uint8_t* LIBGAV1_RESTRICT source) {
const __m128i edge_lo = LoadUnaligned16(source);
const __m128i edge_hi = _mm_srli_si128(edge_lo, 4);
// Finish |edge_lo| life cycle quickly.
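
A scalar reference for the three kernels named in the comments above. Each tap set sums to 16, so every output is a rounded sixteenth of a 5-tap neighborhood (sketch; clamping at the ends of the edge buffer is elided):

#include <cstdint>

// kernel 1: {0, 4, 8, 4, 0}   kernel 2: {0, 5, 6, 5, 0}
// kernel 3: {2, 4, 4, 4, 2}
uint8_t FilterEdgePixel(const uint8_t* edge, int i, const int kernel[5]) {
  int sum = 0;
  for (int j = 0; j < 5; ++j) sum += kernel[j] * edge[i + j - 2];
  return static_cast<uint8_t>((sum + 8) >> 4);  // round(sum / 16)
}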
diff --git a/src/dsp/x86/intrapred_cfl_sse4.cc b/src/dsp/x86/intrapred_cfl_sse4.cc
index f2dcfdb..eb7e466 100644
--- a/src/dsp/x86/intrapred_cfl_sse4.cc
+++ b/src/dsp/x86/intrapred_cfl_sse4.cc
@@ -88,7 +88,7 @@ inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
template <int width, int height>
void CflIntraPredictor_SSE4_1(
- void* const dest, ptrdiff_t stride,
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int alpha) {
auto* dst = static_cast<uint8_t*>(dest);
@@ -127,7 +127,8 @@ void CflIntraPredictor_SSE4_1(
template <int block_height_log2, bool is_inside>
void CflSubsampler444_4xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
- const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride) {
static_assert(block_height_log2 <= 4, "");
const int block_height = 1 << block_height_log2;
const int visible_height = max_luma_height;
@@ -189,7 +190,7 @@ template <int block_height_log2>
void CflSubsampler444_4xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int max_luma_width, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
static_assert(block_height_log2 <= 4, "");
assert(max_luma_width >= 4);
assert(max_luma_height >= 4);
@@ -209,7 +210,7 @@ template <int block_height_log2, bool inside>
void CflSubsampler444_8xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int max_luma_width, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
static_assert(block_height_log2 <= 5, "");
const int block_height = 1 << block_height_log2, block_width = 8;
const int visible_height = max_luma_height;
@@ -292,7 +293,7 @@ template <int block_height_log2>
void CflSubsampler444_8xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int max_luma_width, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
static_assert(block_height_log2 <= 5, "");
assert(max_luma_width >= 4);
assert(max_luma_height >= 4);
@@ -315,7 +316,7 @@ template <int block_width_log2, int block_height_log2, bool inside>
void CflSubsampler444_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int max_luma_width, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
static_assert(block_width_log2 == 4 || block_width_log2 == 5, "");
static_assert(block_height_log2 <= 5, "");
assert(max_luma_width >= 4);
@@ -418,7 +419,7 @@ template <int block_width_log2, int block_height_log2>
void CflSubsampler444_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int max_luma_width, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
static_assert(block_width_log2 == 4 || block_width_log2 == 5, "");
static_assert(block_height_log2 <= 5, "");
assert(max_luma_width >= 4);
@@ -441,7 +442,7 @@ template <int block_height_log2>
void CflSubsampler420_4xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int /*max_luma_width*/, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
const int block_height = 1 << block_height_log2;
const auto* src = static_cast<const uint8_t*>(source);
int16_t* luma_ptr = luma[0];
@@ -511,7 +512,7 @@ template <int block_height_log2, int max_luma_width>
inline void CflSubsampler420Impl_8xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int /*max_luma_width*/, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
const int block_height = 1 << block_height_log2;
const auto* src = static_cast<const uint8_t*>(source);
const __m128i zero = _mm_setzero_si128();
@@ -620,7 +621,7 @@ template <int block_height_log2>
void CflSubsampler420_8xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int max_luma_width, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
if (max_luma_width == 8) {
CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(
luma, max_luma_width, max_luma_height, source, stride);
@@ -634,7 +635,7 @@ template <int block_width_log2, int block_height_log2, int max_luma_width>
inline void CflSubsampler420Impl_WxH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int /*max_luma_width*/, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
const auto* src = static_cast<const uint8_t*>(source);
const __m128i zero = _mm_setzero_si128();
__m128i final_sum = zero;
@@ -751,7 +752,7 @@ template <int block_width_log2, int block_height_log2>
void CflSubsampler420_WxH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int max_luma_width, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
switch (max_luma_width) {
case 8:
CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
@@ -968,7 +969,7 @@ inline __m128i ClipEpi16(__m128i x, __m128i min, __m128i max) {
template <int width, int height>
void CflIntraPredictor_10bpp_SSE4_1(
- void* const dest, ptrdiff_t stride,
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int alpha) {
constexpr int kCflLumaBufferStrideLog2_16i = 5;
@@ -1018,7 +1019,8 @@ void CflIntraPredictor_10bpp_SSE4_1(
template <int block_height_log2, bool is_inside>
void CflSubsampler444_4xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
- const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride) {
static_assert(block_height_log2 <= 4, "");
const int block_height = 1 << block_height_log2;
const int visible_height = max_luma_height;
@@ -1079,7 +1081,7 @@ template <int block_height_log2>
void CflSubsampler444_4xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int max_luma_width, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
static_cast<void>(max_luma_width);
static_cast<void>(max_luma_height);
static_assert(block_height_log2 <= 4, "");
@@ -1099,7 +1101,8 @@ void CflSubsampler444_4xH_SSE4_1(
template <int block_height_log2, bool is_inside>
void CflSubsampler444_8xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
- const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride) {
const int block_height = 1 << block_height_log2;
const int visible_height = max_luma_height;
const __m128i dup16 = _mm_set1_epi32(0x01000100);
@@ -1158,7 +1161,7 @@ template <int block_height_log2>
void CflSubsampler444_8xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int max_luma_width, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
static_cast<void>(max_luma_width);
static_cast<void>(max_luma_height);
static_assert(block_height_log2 <= 5, "");
@@ -1182,7 +1185,7 @@ template <int block_width_log2, int block_height_log2, bool is_inside>
void CflSubsampler444_WxH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int max_luma_width, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
const int block_height = 1 << block_height_log2;
const int visible_height = max_luma_height;
const int block_width = 1 << block_width_log2;
@@ -1278,7 +1281,7 @@ template <int block_width_log2, int block_height_log2>
void CflSubsampler444_WxH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int max_luma_width, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
static_assert(block_width_log2 == 4 || block_width_log2 == 5,
"This function will only work for block_width 16 and 32.");
static_assert(block_height_log2 <= 5, "");
@@ -1300,7 +1303,7 @@ template <int block_height_log2>
void CflSubsampler420_4xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int /*max_luma_width*/, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
const int block_height = 1 << block_height_log2;
const auto* src = static_cast<const uint16_t*>(source);
const ptrdiff_t src_stride = stride / sizeof(src[0]);
@@ -1371,7 +1374,8 @@ void CflSubsampler420_4xH_SSE4_1(
template <int block_height_log2, int max_luma_width>
inline void CflSubsampler420Impl_8xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
- const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride) {
const int block_height = 1 << block_height_log2;
const auto* src = static_cast<const uint16_t*>(source);
const ptrdiff_t src_stride = stride / sizeof(src[0]);
@@ -1483,7 +1487,7 @@ template <int block_height_log2>
void CflSubsampler420_8xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int max_luma_width, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
if (max_luma_width == 8) {
CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(luma, max_luma_height,
source, stride);
@@ -1496,7 +1500,8 @@ void CflSubsampler420_8xH_SSE4_1(
template <int block_width_log2, int block_height_log2, int max_luma_width>
inline void CflSubsampler420Impl_WxH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
- const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
+ ptrdiff_t stride) {
const auto* src = static_cast<const uint16_t*>(source);
const ptrdiff_t src_stride = stride / sizeof(src[0]);
const __m128i zero = _mm_setzero_si128();
@@ -1615,7 +1620,7 @@ template <int block_width_log2, int block_height_log2>
void CflSubsampler420_WxH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
const int max_luma_width, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
switch (max_luma_width) {
case 8:
CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
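
The subsamplers above store luma in a common Q3 domain (444 samples shifted left by 3, 420 sums of a 2x2 neighborhood shifted left by 1) and subtract the block average; the predictor then applies the signed Q3 alpha and adds the DC prediction. A scalar model (sketch; the rounding matches the round-half-away-from-zero of the C reference):

#include <algorithm>

int CflPredictPixel(int dc_pred, int luma_ac_q3, int alpha_q3, int max_pixel) {
  const int scaled = alpha_q3 * luma_ac_q3;  // Q6
  const int ac = (scaled >= 0) ? (scaled + 32) >> 6 : -((-scaled + 32) >> 6);
  return std::clamp(dc_pred + ac, 0, max_pixel);
}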
diff --git a/src/dsp/x86/intrapred_filter_sse4.cc b/src/dsp/x86/intrapred_filter_sse4.cc
index 022af8d..a43a5cf 100644
--- a/src/dsp/x86/intrapred_filter_sse4.cc
+++ b/src/dsp/x86/intrapred_filter_sse4.cc
@@ -64,10 +64,10 @@ constexpr int kDuplicateFirstHalf = 0x44;
// at zero to preserve the sum.
// |pixels| contains p0-p7 in order as shown above.
// |taps_0_1| contains the filter kernels used to predict f0 and f1, and so on.
-inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride,
- const __m128i& pixels, const __m128i& taps_0_1,
- const __m128i& taps_2_3, const __m128i& taps_4_5,
- const __m128i& taps_6_7) {
+inline void Filter4x2_SSE4_1(uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride, const __m128i& pixels,
+ const __m128i& taps_0_1, const __m128i& taps_2_3,
+ const __m128i& taps_4_5, const __m128i& taps_6_7) {
const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1);
const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3);
// |output_half| contains 8 partial sums for f0-f7.
@@ -93,10 +93,10 @@ inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride,
// for successive blocks. This implementation takes advantage of the fact
// that the p5 and p6 for each sub-block come solely from the |left_ptr| buffer,
// using shifts to arrange things to fit reusable shuffle vectors.
-inline void Filter4xH(uint8_t* dest, ptrdiff_t stride,
- const uint8_t* const top_ptr,
- const uint8_t* const left_ptr, FilterIntraPredictor pred,
- const int height) {
+inline void Filter4xH(uint8_t* LIBGAV1_RESTRICT dest, ptrdiff_t stride,
+ const uint8_t* LIBGAV1_RESTRICT const top_ptr,
+ const uint8_t* LIBGAV1_RESTRICT const left_ptr,
+ FilterIntraPredictor pred, const int height) {
// Two filter kernels per vector.
const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]);
const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]);
@@ -271,9 +271,10 @@ inline void Filter4xH(uint8_t* dest, ptrdiff_t stride,
}
}
-void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column,
+void FilterIntraPredictor_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column,
FilterIntraPredictor pred, const int width,
const int height) {
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
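
Each 4x2 output block of the filter intra predictor is a 7-tap affine combination of one top-left pixel, four pixels above, and two to the left; Filter4x2_SSE4_1 evaluates eight such sums at once via _mm_maddubs_epi16. A scalar reference for a single output (sketch; sign-aware rounding as in the spec):

#include <algorithm>
#include <cstdint>

uint8_t FilterIntraPixel(const int8_t taps[7], const uint8_t p[7]) {
  int sum = 0;
  for (int j = 0; j < 7; ++j) sum += taps[j] * p[j];
  const int rounded = (sum >= 0) ? (sum + 8) >> 4 : -((-sum + 8) >> 4);
  return static_cast<uint8_t>(std::clamp(rounded, 0, 255));
}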
diff --git a/src/dsp/x86/intrapred_smooth_sse4.cc b/src/dsp/x86/intrapred_smooth_sse4.cc
index de9f551..b53ee8c 100644
--- a/src/dsp/x86/intrapred_smooth_sse4.cc
+++ b/src/dsp/x86/intrapred_smooth_sse4.cc
@@ -38,23 +38,12 @@ namespace {
 // to have visibility of the values. This helps reduce loads and aids in the
 // creation of the inverse weights.
constexpr uint8_t kSmoothWeights[] = {
- // block dimension = 4
- 255, 149, 85, 64,
- // block dimension = 8
- 255, 197, 146, 105, 73, 50, 37, 32,
- // block dimension = 16
- 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
- // block dimension = 32
- 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
- 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
- // block dimension = 64
- 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
- 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
- 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
- 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
+#include "src/dsp/smooth_weights.inc"
+};
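
The weights now come from a shared include instead of a duplicated literal table; the values are unchanged. For reference, the full SMOOTH predictor those weights feed, in scalar form (sketch; w_x and w_y point at the table row for the block's width and height, i.e. kSmoothWeights + dim - 4):

#include <cstdint>

uint8_t SmoothPixel(const uint8_t* top, const uint8_t* left, int width,
                    int height, int x, int y, const uint8_t* w_x,
                    const uint8_t* w_y) {
  const int bottom_left = left[height - 1];
  const int top_right = top[width - 1];
  const int sum = w_y[y] * top[x] + (256 - w_y[y]) * bottom_left +
                  w_x[x] * left[y] + (256 - w_x[x]) * top_right;
  return static_cast<uint8_t>((sum + 256) >> 9);  // Round2(sum, 9)
}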
template <int y_mask>
-inline void WriteSmoothHorizontalSum4(void* const dest, const __m128i& left,
+inline void WriteSmoothHorizontalSum4(void* LIBGAV1_RESTRICT const dest,
+ const __m128i& left,
const __m128i& weights,
const __m128i& scaled_top_right,
const __m128i& round) {
@@ -77,7 +66,8 @@ inline __m128i SmoothDirectionalSum8(const __m128i& pixels,
return _mm_add_epi16(scaled_corner, weighted_px);
}
-inline void WriteSmoothDirectionalSum8(uint8_t* dest, const __m128i& pixels,
+inline void WriteSmoothDirectionalSum8(uint8_t* LIBGAV1_RESTRICT dest,
+ const __m128i& pixels,
const __m128i& weights,
const __m128i& scaled_corner,
const __m128i& round) {
@@ -91,13 +81,11 @@ inline void WriteSmoothDirectionalSum8(uint8_t* dest, const __m128i& pixels,
// For Horizontal, pixels1 and pixels2 are the same repeated value. For
// Vertical, weights1 and weights2 are the same, and scaled_corner1 and
// scaled_corner2 are the same.
-inline void WriteSmoothDirectionalSum16(uint8_t* dest, const __m128i& pixels1,
- const __m128i& pixels2,
- const __m128i& weights1,
- const __m128i& weights2,
- const __m128i& scaled_corner1,
- const __m128i& scaled_corner2,
- const __m128i& round) {
+inline void WriteSmoothDirectionalSum16(
+ uint8_t* LIBGAV1_RESTRICT dest, const __m128i& pixels1,
+ const __m128i& pixels2, const __m128i& weights1, const __m128i& weights2,
+ const __m128i& scaled_corner1, const __m128i& scaled_corner2,
+ const __m128i& round) {
const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
@@ -109,8 +97,9 @@ inline void WriteSmoothDirectionalSum16(uint8_t* dest, const __m128i& pixels1,
}
template <int y_mask>
-inline void WriteSmoothPredSum4(uint8_t* const dest, const __m128i& top,
- const __m128i& left, const __m128i& weights_x,
+inline void WriteSmoothPredSum4(uint8_t* LIBGAV1_RESTRICT const dest,
+ const __m128i& top, const __m128i& left,
+ const __m128i& weights_x,
const __m128i& weights_y,
const __m128i& scaled_bottom_left,
const __m128i& scaled_top_right,
@@ -135,7 +124,8 @@ inline void WriteSmoothPredSum4(uint8_t* const dest, const __m128i& top,
// pixels[0]: above and below_pred interleave vector
// pixels[1]: left vector
// pixels[2]: right_pred vector
-inline void LoadSmoothPixels4(const uint8_t* above, const uint8_t* left,
+inline void LoadSmoothPixels4(const uint8_t* LIBGAV1_RESTRICT above,
+ const uint8_t* LIBGAV1_RESTRICT left,
const int height, __m128i* pixels) {
if (height == 4) {
pixels[1] = Load4(left);
@@ -156,8 +146,9 @@ inline void LoadSmoothPixels4(const uint8_t* above, const uint8_t* left,
// weight_h[2]: same as [0], second half for height = 16 only
// weight_h[3]: same as [1], second half for height = 16 only
// weight_w[0]: weights_w and scale - weights_w interleave vector
-inline void LoadSmoothWeights4(const uint8_t* weight_array, const int height,
- __m128i* weight_h, __m128i* weight_w) {
+inline void LoadSmoothWeights4(const uint8_t* LIBGAV1_RESTRICT weight_array,
+ const int height, __m128i* weight_h,
+ __m128i* weight_w) {
const __m128i scale = _mm_set1_epi16(256);
const __m128i x_weights = Load4(weight_array);
weight_h[0] = _mm_cvtepu8_epi16(x_weights);
@@ -179,7 +170,8 @@ inline void LoadSmoothWeights4(const uint8_t* weight_array, const int height,
}
inline void WriteSmoothPred4x8(const __m128i* pixel, const __m128i* weights_y,
- const __m128i* weight_x, uint8_t* dst,
+ const __m128i* weight_x,
+ uint8_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t stride,
const bool use_second_half) {
const __m128i round = _mm_set1_epi32(256);
@@ -215,8 +207,9 @@ inline void WriteSmoothPred4x8(const __m128i* pixel, const __m128i* weights_y,
// The interleaving approach has some overhead that causes it to underperform in
// the 4x4 case.
-void Smooth4x4_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* top_row, const void* left_column) {
+void Smooth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT top_row,
+ const void* LIBGAV1_RESTRICT left_column) {
const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
@@ -247,8 +240,9 @@ void Smooth4x4_SSE4_1(void* const dest, const ptrdiff_t stride,
scaled_bottom_left, scaled_top_right, scale);
}
-void Smooth4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* top_row, const void* left_column) {
+void Smooth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT top_row,
+ const void* LIBGAV1_RESTRICT left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
__m128i weights_x[1];
@@ -260,8 +254,10 @@ void Smooth4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
}
-void Smooth4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* top_row, const void* left_column) {
+void Smooth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT top_row,
+ const void* LIBGAV1_RESTRICT left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
__m128i weights_x[1];
@@ -283,7 +279,8 @@ void Smooth4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
// pixels[5]: above and below_pred interleave vector, second half
// pixels[6]: left vector + 16
// pixels[7]: right_pred vector
-inline void LoadSmoothPixels8(const uint8_t* above, const uint8_t* left,
+inline void LoadSmoothPixels8(const uint8_t* LIBGAV1_RESTRICT above,
+ const uint8_t* LIBGAV1_RESTRICT left,
const int height, __m128i* pixels) {
const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
__m128i top_row = _mm_cvtepu8_epi16(LoadLo8(above));
@@ -317,8 +314,9 @@ inline void LoadSmoothPixels8(const uint8_t* above, const uint8_t* left,
// weight_h[7]: same as [1], offset 24
// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-inline void LoadSmoothWeights8(const uint8_t* weight_array, const int height,
- __m128i* weight_w, __m128i* weight_h) {
+inline void LoadSmoothWeights8(const uint8_t* LIBGAV1_RESTRICT weight_array,
+ const int height, __m128i* weight_w,
+ __m128i* weight_h) {
const int offset = (height < 8) ? 0 : 4;
__m128i loaded_weights = LoadUnaligned16(&weight_array[offset]);
weight_h[0] = _mm_cvtepu8_epi16(loaded_weights);
@@ -360,7 +358,8 @@ inline void LoadSmoothWeights8(const uint8_t* weight_array, const int height,
inline void WriteSmoothPred8xH(const __m128i* pixels, const __m128i* weights_x,
const __m128i* weights_y, const int height,
- uint8_t* dst, const ptrdiff_t stride,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride,
const bool use_second_half) {
const __m128i round = _mm_set1_epi32(256);
const __m128i mask_increment = _mm_set1_epi16(0x0202);
@@ -405,8 +404,9 @@ inline void WriteSmoothPred8xH(const __m128i* pixels, const __m128i* weights_x,
}
}
-void Smooth8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* top_row, const void* left_column) {
+void Smooth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT top_row,
+ const void* LIBGAV1_RESTRICT left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
__m128i pixels[4];
@@ -419,8 +419,9 @@ void Smooth8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
WriteSmoothPred8xH(pixels, weights_x, weights_y, 4, dst, stride, false);
}
-void Smooth8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* top_row, const void* left_column) {
+void Smooth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT top_row,
+ const void* LIBGAV1_RESTRICT left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -434,8 +435,10 @@ void Smooth8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
}
-void Smooth8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* top_row, const void* left_column) {
+void Smooth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT top_row,
+ const void* LIBGAV1_RESTRICT left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
__m128i pixels[4];
@@ -450,8 +453,10 @@ void Smooth8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
}
-void Smooth8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* top_row, const void* left_column) {
+void Smooth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT top_row,
+ const void* LIBGAV1_RESTRICT left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
__m128i pixels[8];
@@ -473,8 +478,9 @@ void Smooth8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
}
template <int width, int height>
-void SmoothWxH(void* const dest, const ptrdiff_t stride,
- const void* const top_row, const void* const left_column) {
+void SmoothWxH(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
const uint8_t* const sm_weights_h = kSmoothWeights + height - 4;
@@ -532,8 +538,10 @@ void SmoothWxH(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothHorizontal4x4_SSE4_1(void* dest, const ptrdiff_t stride,
- const void* top_row, const void* left_column) {
+void SmoothHorizontal4x4_SSE4_1(void* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT top_row,
+ const void* LIBGAV1_RESTRICT left_column) {
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi32(top_ptr[3]);
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
@@ -553,9 +561,10 @@ void SmoothHorizontal4x4_SSE4_1(void* dest, const ptrdiff_t stride,
WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
}
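
The horizontal variants drop the vertical half of that sum: only the left column and the top-right corner participate, which is why every function below hoists a single top_right broadcast. Scalar form (sketch):

#include <cstdint>

uint8_t SmoothHorizontalPixel(int left_y, int top_right, int w) {
  // w = w_x[x]; each weight pairs with its (256 - w) complement.
  const int sum = w * left_y + (256 - w) * top_right;
  return static_cast<uint8_t>((sum + 128) >> 8);  // Round2(sum, 8)
}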
-void SmoothHorizontal4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothHorizontal4x8_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi32(top[3]);
const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
@@ -585,9 +594,10 @@ void SmoothHorizontal4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
}
-void SmoothHorizontal4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothHorizontal4x16_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi32(top[3]);
const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
@@ -637,9 +647,10 @@ void SmoothHorizontal4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
}
-void SmoothHorizontal8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothHorizontal8x4_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi16(top[7]);
const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
@@ -666,9 +677,10 @@ void SmoothHorizontal8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
}
-void SmoothHorizontal8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothHorizontal8x8_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi16(top[7]);
const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
@@ -686,9 +698,10 @@ void SmoothHorizontal8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothHorizontal8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothHorizontal8x16_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi16(top[7]);
const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
@@ -714,9 +727,10 @@ void SmoothHorizontal8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothHorizontal8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothHorizontal8x32_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi16(top[7]);
const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
@@ -756,9 +770,10 @@ void SmoothHorizontal8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothHorizontal16x4_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothHorizontal16x4_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi16(top[15]);
const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
@@ -795,9 +810,10 @@ void SmoothHorizontal16x4_SSE4_1(void* const dest, const ptrdiff_t stride,
scaled_top_right1, scaled_top_right2, scale);
}
-void SmoothHorizontal16x8_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothHorizontal16x8_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi16(top[15]);
const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
@@ -822,9 +838,10 @@ void SmoothHorizontal16x8_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothHorizontal16x16_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothHorizontal16x16_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi16(top[15]);
const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
@@ -858,9 +875,10 @@ void SmoothHorizontal16x16_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothHorizontal16x32_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothHorizontal16x32_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi16(top[15]);
const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
@@ -910,9 +928,10 @@ void SmoothHorizontal16x32_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothHorizontal16x64_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothHorizontal16x64_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi16(top[15]);
const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
@@ -940,9 +959,10 @@ void SmoothHorizontal16x64_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothHorizontal32x8_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothHorizontal32x8_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi16(top[31]);
const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
@@ -978,9 +998,10 @@ void SmoothHorizontal32x8_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothHorizontal32x16_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothHorizontal32x16_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi16(top[31]);
const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
@@ -1027,9 +1048,10 @@ void SmoothHorizontal32x16_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothHorizontal32x32_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothHorizontal32x32_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi16(top[31]);
const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
@@ -1096,9 +1118,10 @@ void SmoothHorizontal32x32_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothHorizontal32x64_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothHorizontal32x64_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi16(top[31]);
const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
@@ -1137,9 +1160,10 @@ void SmoothHorizontal32x64_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothHorizontal64x16_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothHorizontal64x16_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi16(top[63]);
const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
@@ -1212,9 +1236,10 @@ void SmoothHorizontal64x16_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothHorizontal64x32_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothHorizontal64x32_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi16(top[63]);
const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
@@ -1315,9 +1340,10 @@ void SmoothHorizontal64x32_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothHorizontal64x64_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothHorizontal64x64_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const __m128i top_right = _mm_set1_epi16(top[63]);
const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
@@ -1378,7 +1404,8 @@ void SmoothHorizontal64x64_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-inline void LoadSmoothVerticalPixels4(const uint8_t* above, const uint8_t* left,
+inline void LoadSmoothVerticalPixels4(const uint8_t* LIBGAV1_RESTRICT above,
+ const uint8_t* LIBGAV1_RESTRICT left,
const int height, __m128i* pixels) {
__m128i top = Load4(above);
const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
@@ -1390,7 +1417,8 @@ inline void LoadSmoothVerticalPixels4(const uint8_t* above, const uint8_t* left,
// (256-w) counterparts. This is precomputed by the compiler when the weights
// table is visible to this module. Removing this visibility can cut speed by up
// to half in both 4xH and 8xH transforms.
-inline void LoadSmoothVerticalWeights4(const uint8_t* weight_array,
+inline void LoadSmoothVerticalWeights4(const uint8_t* LIBGAV1_RESTRICT
+ weight_array,
const int height, __m128i* weights) {
const __m128i inverter = _mm_set1_epi16(256);
@@ -1413,7 +1441,8 @@ inline void LoadSmoothVerticalWeights4(const uint8_t* weight_array,
}
inline void WriteSmoothVertical4xH(const __m128i* pixel, const __m128i* weight,
- const int height, uint8_t* dst,
+ const int height,
+ uint8_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t stride) {
const __m128i pred_round = _mm_set1_epi32(128);
const __m128i mask_increment = _mm_set1_epi16(0x0202);
@@ -1438,9 +1467,10 @@ inline void WriteSmoothVertical4xH(const __m128i* pixel, const __m128i* weight,
}
}
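
Symmetrically, the vertical variants interpolate each top pixel toward the bottom-left corner using the height-indexed weights. Scalar form (sketch):

#include <cstdint>

uint8_t SmoothVerticalPixel(int top_x, int bottom_left, int w) {
  // w = w_y[y], the weight row selected by the block height.
  const int sum = w * top_x + (256 - w) * bottom_left;
  return static_cast<uint8_t>((sum + 128) >> 8);  // Round2(sum, 8)
}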
-void SmoothVertical4x4_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left = static_cast<const uint8_t*>(left_column);
const auto* const above = static_cast<const uint8_t*>(top_row);
auto* dst = static_cast<uint8_t*>(dest);
@@ -1453,9 +1483,10 @@ void SmoothVertical4x4_SSE4_1(void* const dest, const ptrdiff_t stride,
WriteSmoothVertical4xH(&pixels, weights, 4, dst, stride);
}
-void SmoothVertical4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left = static_cast<const uint8_t*>(left_column);
const auto* const above = static_cast<const uint8_t*>(top_row);
auto* dst = static_cast<uint8_t*>(dest);
@@ -1468,9 +1499,10 @@ void SmoothVertical4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
}
-void SmoothVertical4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left = static_cast<const uint8_t*>(left_column);
const auto* const above = static_cast<const uint8_t*>(top_row);
auto* dst = static_cast<uint8_t*>(dest);
@@ -1485,9 +1517,10 @@ void SmoothVertical4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
WriteSmoothVertical4xH(&pixels, &weights[2], 8, dst, stride);
}
-void SmoothVertical8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
@@ -1520,9 +1553,10 @@ void SmoothVertical8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
}
-void SmoothVertical8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
@@ -1544,9 +1578,10 @@ void SmoothVertical8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothVertical8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
@@ -1583,9 +1618,10 @@ void SmoothVertical8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothVertical8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const __m128i zero = _mm_setzero_si128();
const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
@@ -1649,9 +1685,10 @@ void SmoothVertical8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothVertical16x4_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
auto* dst = static_cast<uint8_t*>(dest);
const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
@@ -1694,9 +1731,10 @@ void SmoothVertical16x4_SSE4_1(void* const dest, const ptrdiff_t stride,
scale);
}
-void SmoothVertical16x8_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical16x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
auto* dst = static_cast<uint8_t*>(dest);
const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
@@ -1722,9 +1760,10 @@ void SmoothVertical16x8_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothVertical16x16_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical16x16_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
auto* dst = static_cast<uint8_t*>(dest);
const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
@@ -1766,9 +1805,10 @@ void SmoothVertical16x16_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothVertical16x32_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical16x32_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
auto* dst = static_cast<uint8_t*>(dest);
const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
@@ -1839,9 +1879,10 @@ void SmoothVertical16x32_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothVertical16x64_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical16x64_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
auto* dst = static_cast<uint8_t*>(dest);
const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
@@ -1887,9 +1928,10 @@ void SmoothVertical16x64_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothVertical32x8_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical32x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
auto* dst = static_cast<uint8_t*>(dest);
@@ -1922,9 +1964,10 @@ void SmoothVertical32x8_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothVertical32x16_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical32x16_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
auto* dst = static_cast<uint8_t*>(dest);
@@ -1975,9 +2018,10 @@ void SmoothVertical32x16_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothVertical32x32_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical32x32_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
auto* dst = static_cast<uint8_t*>(dest);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -2063,9 +2107,10 @@ void SmoothVertical32x32_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothVertical32x64_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical32x64_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
auto* dst = static_cast<uint8_t*>(dest);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -2120,9 +2165,10 @@ void SmoothVertical32x64_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothVertical64x16_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical64x16_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
auto* dst = static_cast<uint8_t*>(dest);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -2192,9 +2238,10 @@ void SmoothVertical64x16_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothVertical64x32_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical64x32_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
auto* dst = static_cast<uint8_t*>(dest);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -2311,9 +2358,10 @@ void SmoothVertical64x32_SSE4_1(void* const dest, const ptrdiff_t stride,
}
}
-void SmoothVertical64x64_SSE4_1(void* const dest, const ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void SmoothVertical64x64_SSE4_1(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
auto* dst = static_cast<uint8_t*>(dest);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
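
For reference, the smooth-vertical predictor that the kernels above vectorize is, per output row, a weighted blend of the top row with the bottom-left sample. A minimal scalar sketch, assuming the 8bpp kSmoothWeights table's 256 scale and +128 rounding; SmoothVerticalRef and its parameter layout are illustrative, not the library's API:

#include <cstddef>
#include <cstdint>

// Scalar reference: each row blends top[x] against the bottom-left sample
// with a per-row weight, matching the epi16 multiply/add and shift-by-8
// the SSE4 kernels perform.
void SmoothVerticalRef(uint8_t* dst, ptrdiff_t stride, const uint8_t* top,
                       const uint8_t* left, const uint8_t* weights, int width,
                       int height) {
  const uint8_t bottom_left = left[height - 1];
  for (int y = 0; y < height; ++y) {
    const int w = weights[y];  // weight applied to the top-row sample
    for (int x = 0; x < width; ++x) {
      dst[x] = static_cast<uint8_t>(
          (w * top[x] + (256 - w) * bottom_left + 128) >> 8);
    }
    dst += stride;
  }
}
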
diff --git a/src/dsp/x86/intrapred_sse4.cc b/src/dsp/x86/intrapred_sse4.cc
index 063929d..556afed 100644
--- a/src/dsp/x86/intrapred_sse4.cc
+++ b/src/dsp/x86/intrapred_sse4.cc
@@ -90,11 +90,11 @@ struct DirectionalPredFuncs_SSE4_1 {
template <int width_log2, int height_log2, DcSumFunc top_sumfn,
DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
-void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
- shiftk, dc_mult>::DcTop(void* const dest,
- ptrdiff_t stride,
- const void* const top_row,
- const void* /*left_column*/) {
+void DcPredFuncs_SSE4_1<
+ width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
+ dc_mult>::DcTop(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* /*left_column*/) {
const __m128i rounder = _mm_set1_epi32(1 << (width_log2 - 1));
const __m128i sum = top_sumfn(top_row);
const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2);
@@ -103,11 +103,11 @@ void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
template <int width_log2, int height_log2, DcSumFunc top_sumfn,
DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
-void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
- shiftk,
- dc_mult>::DcLeft(void* const dest, ptrdiff_t stride,
- const void* /*top_row*/,
- const void* const left_column) {
+void DcPredFuncs_SSE4_1<
+ width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
+ dc_mult>::DcLeft(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i rounder = _mm_set1_epi32(1 << (height_log2 - 1));
const __m128i sum = left_sumfn(left_column);
const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), height_log2);
@@ -116,10 +116,11 @@ void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
template <int width_log2, int height_log2, DcSumFunc top_sumfn,
DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
-void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
- shiftk, dc_mult>::Dc(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void DcPredFuncs_SSE4_1<
+ width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
+ dc_mult>::Dc(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i rounder =
_mm_set1_epi32((1 << (width_log2 - 1)) + (1 << (height_log2 - 1)));
const __m128i sum_top = top_sumfn(top_row);
@@ -141,8 +142,8 @@ void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
template <ColumnStoreFunc col_storefn>
void DirectionalPredFuncs_SSE4_1<col_storefn>::Horizontal(
- void* const dest, ptrdiff_t stride, const void* /*top_row*/,
- const void* const left_column) {
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* /*top_row*/, const void* LIBGAV1_RESTRICT const left_column) {
col_storefn(dest, stride, left_column);
}
@@ -384,8 +385,9 @@ inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
// ColStoreN<height> copies each of the |height| values in |column| across its
// corresponding row in dest.
template <WriteDuplicateFunc writefn>
-inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const __m128i col_data = Load4(column);
const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
const __m128i col_dup32 = _mm_unpacklo_epi16(col_dup16, col_dup16);
@@ -393,8 +395,9 @@ inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride,
}
template <WriteDuplicateFunc writefn>
-inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const ptrdiff_t stride4 = stride << 2;
const __m128i col_data = LoadLo8(column);
const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
@@ -407,8 +410,9 @@ inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride,
}
template <WriteDuplicateFunc writefn>
-inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const ptrdiff_t stride4 = stride << 2;
const __m128i col_data = _mm_loadu_si128(static_cast<const __m128i*>(column));
const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
@@ -428,8 +432,9 @@ inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride,
}
template <WriteDuplicateFunc writefn>
-inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const ptrdiff_t stride4 = stride << 2;
auto* dst = static_cast<uint8_t*>(dest);
for (int y = 0; y < 32; y += 16) {
@@ -457,8 +462,9 @@ inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride,
}
template <WriteDuplicateFunc writefn>
-inline void ColStore64_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const ptrdiff_t stride4 = stride << 2;
auto* dst = static_cast<uint8_t*>(dest);
for (int y = 0; y < 64; y += 16) {
@@ -574,7 +580,7 @@ struct DirDefs {
};
template <int y_mask>
-inline void WritePaethLine4(uint8_t* dst, const __m128i& top,
+inline void WritePaethLine4(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
const __m128i& left, const __m128i& top_lefts,
const __m128i& top_dists, const __m128i& left_dists,
const __m128i& top_left_diffs) {
@@ -614,7 +620,7 @@ inline void WritePaethLine4(uint8_t* dst, const __m128i& top,
// could pay off to accommodate top_left_dists for cmpgt, and repack into epi8
// for the blends.
template <int y_mask>
-inline void WritePaethLine8(uint8_t* dst, const __m128i& top,
+inline void WritePaethLine8(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
const __m128i& left, const __m128i& top_lefts,
const __m128i& top_dists, const __m128i& left_dists,
const __m128i& top_left_diffs) {
@@ -658,7 +664,7 @@ inline void WritePaethLine8(uint8_t* dst, const __m128i& top,
// |left_dists| is provided alongside its spread-out version because it doesn't
// change between calls and interacts with both kinds of packing.
template <int y_mask>
-inline void WritePaethLine16(uint8_t* dst, const __m128i& top,
+inline void WritePaethLine16(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
const __m128i& left, const __m128i& top_lefts,
const __m128i& top_dists,
const __m128i& left_dists,
@@ -712,8 +718,9 @@ inline void WritePaethLine16(uint8_t* dst, const __m128i& top,
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), pred);
}
-void Paeth4x4_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row, const void* const left_column) {
+void Paeth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
@@ -742,8 +749,9 @@ void Paeth4x4_SSE4_1(void* const dest, ptrdiff_t stride,
top_left_diff);
}
-void Paeth4x8_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row, const void* const left_column) {
+void Paeth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = LoadLo8(left_column);
const __m128i left_lo = _mm_cvtepu8_epi32(left);
const __m128i left_hi = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
@@ -787,9 +795,9 @@ void Paeth4x8_SSE4_1(void* const dest, ptrdiff_t stride,
top_left_diff);
}
-void Paeth4x16_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = LoadUnaligned16(left_column);
const __m128i left_0 = _mm_cvtepu8_epi32(left);
const __m128i left_1 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
@@ -862,8 +870,9 @@ void Paeth4x16_SSE4_1(void* const dest, ptrdiff_t stride,
top_left_diff);
}
-void Paeth8x4_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row, const void* const left_column) {
+void Paeth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -891,8 +900,9 @@ void Paeth8x4_SSE4_1(void* const dest, ptrdiff_t stride,
top_left_diff);
}
-void Paeth8x8_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row, const void* const left_column) {
+void Paeth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -932,9 +942,9 @@ void Paeth8x8_SSE4_1(void* const dest, ptrdiff_t stride,
top_left_diff);
}
-void Paeth8x16_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = LoadUnaligned16(left_column);
const __m128i left_lo = _mm_cvtepu8_epi16(left);
const __m128i left_hi = _mm_cvtepu8_epi16(_mm_srli_si128(left, 8));
@@ -1001,18 +1011,18 @@ void Paeth8x16_SSE4_1(void* const dest, ptrdiff_t stride,
left_dists, top_left_diff);
}
-void Paeth8x32_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
auto* const dst = static_cast<uint8_t*>(dest);
Paeth8x16_SSE4_1(dst, stride, top_row, left_column);
Paeth8x16_SSE4_1(dst + (stride << 4), stride, top_row, left_ptr + 16);
}
-void Paeth16x4_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = Load4(left_column);
const __m128i top = LoadUnaligned16(top_row);
const __m128i top_lo = _mm_cvtepu8_epi16(top);
@@ -1057,7 +1067,7 @@ void Paeth16x4_SSE4_1(void* const dest, ptrdiff_t stride,
// Inlined for calling with offsets in larger transform sizes, mainly to
// preserve top_left.
-inline void WritePaeth16x8(void* const dest, ptrdiff_t stride,
+inline void WritePaeth16x8(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const uint8_t top_left, const __m128i top,
const __m128i left) {
const __m128i top_lo = _mm_cvtepu8_epi16(top);
@@ -1115,9 +1125,9 @@ inline void WritePaeth16x8(void* const dest, ptrdiff_t stride,
top_left_diff_lo, top_left_diff_hi);
}
-void Paeth16x8_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth16x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i top = LoadUnaligned16(top_row);
const __m128i left = LoadLo8(left_column);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -1213,18 +1223,18 @@ void WritePaeth16x16(void* const dest, ptrdiff_t stride, const uint8_t top_left,
top_left_diff_lo, top_left_diff_hi);
}
-void Paeth16x16_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth16x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = LoadUnaligned16(left_column);
const __m128i top = LoadUnaligned16(top_row);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
WritePaeth16x16(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
}
-void Paeth16x32_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth16x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left_0 = LoadUnaligned16(left_column);
const __m128i top = LoadUnaligned16(top_row);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -1236,9 +1246,9 @@ void Paeth16x32_SSE4_1(void* const dest, ptrdiff_t stride,
WritePaeth16x16(dst + (stride << 4), stride, top_left, top, left_1);
}
-void Paeth16x64_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth16x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const ptrdiff_t stride16 = stride << 4;
const __m128i left_0 = LoadUnaligned16(left_column);
const __m128i top = LoadUnaligned16(top_row);
@@ -1258,9 +1268,9 @@ void Paeth16x64_SSE4_1(void* const dest, ptrdiff_t stride,
WritePaeth16x16(dst, stride, top_left, top, left_3);
}
-void Paeth32x8_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth32x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = LoadLo8(left_column);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
const __m128i top_0 = LoadUnaligned16(top_row);
@@ -1271,9 +1281,9 @@ void Paeth32x8_SSE4_1(void* const dest, ptrdiff_t stride,
WritePaeth16x8(dst + 16, stride, top_left, top_1, left);
}
-void Paeth32x16_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth32x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = LoadUnaligned16(left_column);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
const __m128i top_0 = LoadUnaligned16(top_row);
@@ -1284,9 +1294,9 @@ void Paeth32x16_SSE4_1(void* const dest, ptrdiff_t stride,
WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
}
-void Paeth32x32_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth32x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const __m128i left_0 = LoadUnaligned16(left_ptr);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -1302,9 +1312,9 @@ void Paeth32x32_SSE4_1(void* const dest, ptrdiff_t stride,
WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
}
-void Paeth32x64_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth32x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const __m128i left_0 = LoadUnaligned16(left_ptr);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -1328,9 +1338,9 @@ void Paeth32x64_SSE4_1(void* const dest, ptrdiff_t stride,
WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
}
-void Paeth64x16_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth64x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = LoadUnaligned16(left_column);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
const __m128i top_0 = LoadUnaligned16(top_ptr);
@@ -1345,9 +1355,9 @@ void Paeth64x16_SSE4_1(void* const dest, ptrdiff_t stride,
WritePaeth16x16(dst + 48, stride, top_left, top_3, left);
}
-void Paeth64x32_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth64x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const __m128i left_0 = LoadUnaligned16(left_ptr);
const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
@@ -1369,9 +1379,9 @@ void Paeth64x32_SSE4_1(void* const dest, ptrdiff_t stride,
WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
}
-void Paeth64x64_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth64x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const __m128i left_0 = LoadUnaligned16(left_ptr);
const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
@@ -1793,7 +1803,6 @@ void Init8bpp() {
DirDefs::_64x64::Horizontal;
#endif
} // NOLINT(readability/fn_size)
-// TODO(petersonab): Split Init8bpp function into family-specific files.
} // namespace
} // namespace low_bitdepth
@@ -1937,16 +1946,18 @@ inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
// ColStoreN<height> copies each of the |height| values in |column| across its
// corresponding row in dest.
template <WriteDuplicateFunc writefn>
-inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const __m128i col_data = LoadLo8(column);
const __m128i col_dup32 = _mm_unpacklo_epi16(col_data, col_data);
writefn(dest, stride, col_dup32);
}
template <WriteDuplicateFunc writefn>
-inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const __m128i col_data = LoadUnaligned16(column);
const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
@@ -1958,8 +1969,9 @@ inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride,
}
template <WriteDuplicateFunc writefn>
-inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const ptrdiff_t stride4 = stride << 2;
auto* dst = static_cast<uint8_t*>(dest);
for (int y = 0; y < 32; y += 16) {
@@ -1975,8 +1987,9 @@ inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride,
}
template <WriteDuplicateFunc writefn>
-inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const ptrdiff_t stride4 = stride << 2;
auto* dst = static_cast<uint8_t*>(dest);
for (int y = 0; y < 64; y += 16) {
@@ -1992,8 +2005,9 @@ inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride,
}
template <WriteDuplicateFunc writefn>
-inline void ColStore64_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const ptrdiff_t stride4 = stride << 2;
auto* dst = static_cast<uint8_t*>(dest);
for (int y = 0; y < 128; y += 16) {
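
Per pixel, the Paeth kernels above reduce to the standard selection rule; a scalar sketch for orientation (PaethPredictRef is illustrative):

#include <cstdint>
#include <cstdlib>

// Scalar form of what WritePaethLine{4,8,16} compute with packed compares
// and blends: pick whichever of top, left and top-left is closest to
// base = top + left - top_left, preferring left, then top.
uint8_t PaethPredictRef(uint8_t top, uint8_t left, uint8_t top_left) {
  const int base = top + left - top_left;
  const int left_dist = std::abs(base - left);      // == |top - top_left|
  const int top_dist = std::abs(base - top);        // == |left - top_left|
  const int top_left_dist = std::abs(base - top_left);
  if (left_dist <= top_dist && left_dist <= top_left_dist) return left;
  if (top_dist <= top_left_dist) return top;
  return top_left;
}
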
diff --git a/src/dsp/x86/inverse_transform_sse4.cc b/src/dsp/x86/inverse_transform_sse4.cc
index 12c008f..e9ceb87 100644
--- a/src/dsp/x86/inverse_transform_sse4.cc
+++ b/src/dsp/x86/inverse_transform_sse4.cc
@@ -41,7 +41,8 @@ namespace {
#include "src/dsp/inverse_transform.inc"
template <int store_width, int store_count>
-LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* dst, int32_t stride, int32_t idx,
+LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* LIBGAV1_RESTRICT dst,
+ int32_t stride, int32_t idx,
const __m128i* s) {
// NOTE: It is expected that the compiler will unroll these loops.
if (store_width == 16) {
@@ -63,8 +64,8 @@ LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* dst, int32_t stride, int32_t idx,
}
template <int load_width, int load_count>
-LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* src, int32_t stride,
- int32_t idx, __m128i* x) {
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* LIBGAV1_RESTRICT src,
+ int32_t stride, int32_t idx, __m128i* x) {
// NOTE: It is expected that the compiler will unroll these loops.
if (load_width == 16) {
for (int i = 0; i < load_count; i += 4) {
@@ -1638,9 +1639,10 @@ LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
LIBGAV1_ALWAYS_INLINE void Identity4ColumnStoreToFrame(
Array2DView<uint8_t> frame, const int start_x, const int start_y,
- const int tx_width, const int tx_height, const int16_t* source) {
+ const int tx_width, const int tx_height,
+ const int16_t* LIBGAV1_RESTRICT source) {
const int stride = frame.columns();
- uint8_t* dst = frame[start_y] + start_x;
+ uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
const __m128i v_multiplier_fraction =
_mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
@@ -1685,9 +1687,10 @@ LIBGAV1_ALWAYS_INLINE void Identity4ColumnStoreToFrame(
LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
Array2DView<uint8_t> frame, const int start_x, const int start_y,
- const int tx_width, const int tx_height, const int16_t* source) {
+ const int tx_width, const int tx_height,
+ const int16_t* LIBGAV1_RESTRICT source) {
const int stride = frame.columns();
- uint8_t* dst = frame[start_y] + start_x;
+ uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
const __m128i v_multiplier_fraction =
_mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
@@ -1789,9 +1792,10 @@ LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
LIBGAV1_ALWAYS_INLINE void Identity8ColumnStoreToFrame_SSE4_1(
Array2DView<uint8_t> frame, const int start_x, const int start_y,
- const int tx_width, const int tx_height, const int16_t* source) {
+ const int tx_width, const int tx_height,
+ const int16_t* LIBGAV1_RESTRICT source) {
const int stride = frame.columns();
- uint8_t* dst = frame[start_y] + start_x;
+ uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
const __m128i v_eight = _mm_set1_epi16(8);
if (tx_width == 4) {
int i = 0;
@@ -1883,9 +1887,10 @@ LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
LIBGAV1_ALWAYS_INLINE void Identity16ColumnStoreToFrame_SSE4_1(
Array2DView<uint8_t> frame, const int start_x, const int start_y,
- const int tx_width, const int tx_height, const int16_t* source) {
+ const int tx_width, const int tx_height,
+ const int16_t* LIBGAV1_RESTRICT source) {
const int stride = frame.columns();
- uint8_t* dst = frame[start_y] + start_x;
+ uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
const __m128i v_eight = _mm_set1_epi16(8);
const __m128i v_multiplier =
_mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 4));
@@ -1966,9 +1971,10 @@ LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
LIBGAV1_ALWAYS_INLINE void Identity32ColumnStoreToFrame(
Array2DView<uint8_t> frame, const int start_x, const int start_y,
- const int tx_width, const int tx_height, const int16_t* source) {
+ const int tx_width, const int tx_height,
+ const int16_t* LIBGAV1_RESTRICT source) {
const int stride = frame.columns();
- uint8_t* dst = frame[start_y] + start_x;
+ uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
const __m128i v_two = _mm_set1_epi16(2);
int i = 0;
@@ -1995,7 +2001,7 @@ LIBGAV1_ALWAYS_INLINE void Identity32ColumnStoreToFrame(
// Process 4 wht4 rows and columns.
LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView<uint8_t> frame,
const int start_x, const int start_y,
- const void* source,
+ const void* LIBGAV1_RESTRICT source,
const int adjusted_tx_height) {
const auto* const src = static_cast<const int16_t*>(source);
__m128i s[4], x[4];
@@ -2058,12 +2064,11 @@ LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView<uint8_t> frame,
// Store to frame.
const int stride = frame.columns();
- uint8_t* dst = frame[start_y] + start_x;
+ uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
for (int row = 0; row < 4; ++row) {
const __m128i frame_data = Load4(dst);
const __m128i a = _mm_cvtepu8_epi16(frame_data);
- // Saturate to prevent overflowing int16_t
- const __m128i b = _mm_adds_epi16(a, s[row]);
+ const __m128i b = _mm_add_epi16(a, s[row]);
Store4(dst, _mm_packus_epi16(b, b));
dst += stride;
}
@@ -2075,13 +2080,13 @@ LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView<uint8_t> frame,
template <bool enable_flip_rows = false>
LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
Array2DView<uint8_t> frame, const int start_x, const int start_y,
- const int tx_width, const int tx_height, const int16_t* source,
- TransformType tx_type) {
+ const int tx_width, const int tx_height,
+ const int16_t* LIBGAV1_RESTRICT source, TransformType tx_type) {
const bool flip_rows =
enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
const __m128i v_eight = _mm_set1_epi16(8);
const int stride = frame.columns();
- uint8_t* dst = frame[start_y] + start_x;
+ uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
if (tx_width == 4) {
for (int i = 0; i < tx_height; ++i) {
const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
@@ -2262,8 +2267,10 @@ void Dct4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
void Dct4TransformLoopColumn_SSE4_1(TransformType tx_type,
TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
- int start_x, int start_y, void* dst_frame) {
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int16_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2325,8 +2332,10 @@ void Dct8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
void Dct8TransformLoopColumn_SSE4_1(TransformType tx_type,
TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
- int start_x, int start_y, void* dst_frame) {
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int16_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2386,9 +2395,10 @@ void Dct16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
void Dct16TransformLoopColumn_SSE4_1(TransformType tx_type,
TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
int start_x, int start_y,
- void* dst_frame) {
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int16_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2441,9 +2451,10 @@ void Dct32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
void Dct32TransformLoopColumn_SSE4_1(TransformType tx_type,
TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
int start_x, int start_y,
- void* dst_frame) {
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int16_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2486,9 +2497,10 @@ void Dct64TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
void Dct64TransformLoopColumn_SSE4_1(TransformType tx_type,
TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
int start_x, int start_y,
- void* dst_frame) {
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int16_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2535,9 +2547,10 @@ void Adst4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
void Adst4TransformLoopColumn_SSE4_1(TransformType tx_type,
TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
int start_x, int start_y,
- void* dst_frame) {
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int16_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2594,9 +2607,10 @@ void Adst8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
void Adst8TransformLoopColumn_SSE4_1(TransformType tx_type,
TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
int start_x, int start_y,
- void* dst_frame) {
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int16_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2658,9 +2672,10 @@ void Adst16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
void Adst16TransformLoopColumn_SSE4_1(TransformType tx_type,
TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
int start_x, int start_y,
- void* dst_frame) {
+ void* LIBGAV1_RESTRICT dst_frame) {
auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
auto* src = static_cast<int16_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2727,8 +2742,9 @@ void Identity4TransformLoopRow_SSE4_1(TransformType tx_type,
void Identity4TransformLoopColumn_SSE4_1(TransformType tx_type,
TransformSize tx_size,
int adjusted_tx_height,
- void* src_buffer, int start_x,
- int start_y, void* dst_frame) {
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
auto* src = static_cast<int16_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2799,8 +2815,9 @@ void Identity8TransformLoopRow_SSE4_1(TransformType tx_type,
void Identity8TransformLoopColumn_SSE4_1(TransformType tx_type,
TransformSize tx_size,
int adjusted_tx_height,
- void* src_buffer, int start_x,
- int start_y, void* dst_frame) {
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int16_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2839,8 +2856,9 @@ void Identity16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
void Identity16TransformLoopColumn_SSE4_1(TransformType tx_type,
TransformSize tx_size,
int adjusted_tx_height,
- void* src_buffer, int start_x,
- int start_y, void* dst_frame) {
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int16_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2884,8 +2902,9 @@ void Identity32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
void Identity32TransformLoopColumn_SSE4_1(TransformType /*tx_type*/,
TransformSize tx_size,
int adjusted_tx_height,
- void* src_buffer, int start_x,
- int start_y, void* dst_frame) {
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
auto* src = static_cast<int16_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2907,8 +2926,10 @@ void Wht4TransformLoopRow_SSE4_1(TransformType tx_type, TransformSize tx_size,
void Wht4TransformLoopColumn_SSE4_1(TransformType tx_type,
TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
- int start_x, int start_y, void* dst_frame) {
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
assert(tx_type == kTransformTypeDctDct);
assert(tx_size == kTransformSize4x4);
static_cast<void>(tx_type);
@@ -2928,88 +2949,88 @@ void Init8bpp() {
assert(dsp != nullptr);
// Maximum transform size for Dct is 64.
-#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformDct)
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dDct)
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
Dct4TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
Dct4TransformLoopColumn_SSE4_1;
#endif
-#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformDct)
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize8_Transform1dDct)
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
Dct8TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
Dct8TransformLoopColumn_SSE4_1;
#endif
-#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformDct)
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize16_Transform1dDct)
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
Dct16TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
Dct16TransformLoopColumn_SSE4_1;
#endif
-#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformDct)
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize32_Transform1dDct)
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
Dct32TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
Dct32TransformLoopColumn_SSE4_1;
#endif
-#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize64_1DTransformDct)
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize64_Transform1dDct)
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
Dct64TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
Dct64TransformLoopColumn_SSE4_1;
#endif
// Maximum transform size for Adst is 16.
-#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformAdst)
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dAdst)
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
Adst4TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
Adst4TransformLoopColumn_SSE4_1;
#endif
-#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformAdst)
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize8_Transform1dAdst)
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
Adst8TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
Adst8TransformLoopColumn_SSE4_1;
#endif
-#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformAdst)
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize16_Transform1dAdst)
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
Adst16TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
Adst16TransformLoopColumn_SSE4_1;
#endif
// Maximum transform size for Identity transform is 32.
-#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformIdentity)
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dIdentity)
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
Identity4TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
Identity4TransformLoopColumn_SSE4_1;
#endif
-#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformIdentity)
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize8_Transform1dIdentity)
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
Identity8TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
Identity8TransformLoopColumn_SSE4_1;
#endif
-#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformIdentity)
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize16_Transform1dIdentity)
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
Identity16TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
Identity16TransformLoopColumn_SSE4_1;
#endif
-#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformIdentity)
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize32_Transform1dIdentity)
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
Identity32TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
Identity32TransformLoopColumn_SSE4_1;
#endif
// Maximum transform size for Wht is 4.
-#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformWht)
- dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dWht)
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
Wht4TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
Wht4TransformLoopColumn_SSE4_1;
#endif
}
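
StoreToFrameWithRound, touched above, is the shared write-back step for the 8bpp column transforms; a scalar sketch of its effect, ignoring the flip_rows path (StoreToFrameRef is illustrative):

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Round each 16-bit residual with (r + 8) >> 4 (the v_eight constant in the
// SSE4 code), add it to the frame sample and clip to [0, 255], which the
// vector path gets from _mm_packus_epi16's unsigned saturation.
void StoreToFrameRef(uint8_t* dst, ptrdiff_t stride, const int16_t* source,
                     int tx_width, int tx_height) {
  for (int y = 0; y < tx_height; ++y) {
    for (int x = 0; x < tx_width; ++x) {
      const int residual = (source[y * tx_width + x] + 8) >> 4;
      dst[x] = static_cast<uint8_t>(std::clamp(dst[x] + residual, 0, 255));
    }
    dst += stride;
  }
}
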
diff --git a/src/dsp/x86/inverse_transform_sse4.h b/src/dsp/x86/inverse_transform_sse4.h
index 106084b..c31e88b 100644
--- a/src/dsp/x86/inverse_transform_sse4.h
+++ b/src/dsp/x86/inverse_transform_sse4.h
@@ -34,56 +34,56 @@ void InverseTransformInit_SSE4_1();
// optimization being enabled, signal that the sse4 implementation should be
// used.
#if LIBGAV1_TARGETING_SSE4_1
-#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct
-#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct LIBGAV1_CPU_SSE4_1
#endif
-#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct
-#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct LIBGAV1_CPU_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dDct LIBGAV1_CPU_SSE4_1
#endif
-#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct
-#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct LIBGAV1_CPU_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dDct LIBGAV1_CPU_SSE4_1
#endif
-#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct
-#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct LIBGAV1_CPU_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dDct LIBGAV1_CPU_SSE4_1
#endif
-#ifndef LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct
-#define LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct LIBGAV1_CPU_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize64_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize64_Transform1dDct LIBGAV1_CPU_SSE4_1
#endif
-#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst
-#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst LIBGAV1_CPU_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dAdst
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dAdst LIBGAV1_CPU_SSE4_1
#endif
-#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst
-#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst LIBGAV1_CPU_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dAdst
+#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dAdst LIBGAV1_CPU_SSE4_1
#endif
-#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst
-#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst LIBGAV1_CPU_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dAdst
+#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dAdst LIBGAV1_CPU_SSE4_1
#endif
-#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity
-#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity LIBGAV1_CPU_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dIdentity
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dIdentity LIBGAV1_CPU_SSE4_1
#endif
-#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity
-#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity LIBGAV1_CPU_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dIdentity
+#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dIdentity LIBGAV1_CPU_SSE4_1
#endif
-#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity
-#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity LIBGAV1_CPU_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dIdentity
+#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dIdentity LIBGAV1_CPU_SSE4_1
#endif
-#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity
-#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity LIBGAV1_CPU_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dIdentity
+#define LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dIdentity LIBGAV1_CPU_SSE4_1
#endif
-#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht
-#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dWht
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dWht LIBGAV1_CPU_SSE4_1
#endif
#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
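
The renames above keep the library's usual dispatch convention: a SIMD header claims an entry point only if nothing higher-priority defined it first, and the Init function checks that claim before installing pointers. A stripped-down sketch of the pattern; the MY_-prefixed names and the CPU bit value are stand-ins:

// A SIMD header defines the per-function macro only when it is still
// unclaimed; DSP_ENABLED-style checks later compare it against the CPU bit.
#define MY_CPU_SSE4_1 (1 << 4)  // stand-in for LIBGAV1_CPU_SSE4_1

#ifndef MY_Dsp8bpp_Transform1dSize4_Transform1dDct  // unclaimed so far
#define MY_Dsp8bpp_Transform1dSize4_Transform1dDct MY_CPU_SSE4_1
#endif

#if MY_Dsp8bpp_Transform1dSize4_Transform1dDct == MY_CPU_SSE4_1
// Init8bpp() would install the SSE4 row/column loops here.
#endif
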
diff --git a/src/dsp/x86/loop_restoration_10bit_avx2.cc b/src/dsp/x86/loop_restoration_10bit_avx2.cc
index b38f322..daf5c42 100644
--- a/src/dsp/x86/loop_restoration_10bit_avx2.cc
+++ b/src/dsp/x86/loop_restoration_10bit_avx2.cc
@@ -472,11 +472,14 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer,
}
void WienerFilter_AVX2(
- const RestorationUnitInfo& restoration_info, const void* const source,
- const ptrdiff_t stride, const void* const top_border,
- const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
const ptrdiff_t bottom_border_stride, const int width, const int height,
- RestorationBuffer* const restoration_buffer, void* const dest) {
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
const int16_t* const number_leading_zero_coefficients =
restoration_info.wiener_info.number_leading_zero_coefficients;
const int number_rows_to_skip = std::max(
@@ -3097,11 +3100,14 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
// at the end of each row. It is safe to overwrite the output as it will not be
// part of the visible frame.
void SelfGuidedFilter_AVX2(
- const RestorationUnitInfo& restoration_info, const void* const source,
- const ptrdiff_t stride, const void* const top_border,
- const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
const ptrdiff_t bottom_border_stride, const int width, const int height,
- RestorationBuffer* const restoration_buffer, void* const dest) {
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
const int index = restoration_info.sgr_proj_info.index;
const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
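
Most of this patch is the same mechanical change: qualifying pointer parameters that never alias with LIBGAV1_RESTRICT. A minimal sketch of why that matters, assuming the macro expands to the compiler's __restrict (MY_RESTRICT and AddRows are illustrative):

#include <cstdint>

#if defined(__GNUC__) || defined(_MSC_VER)
#define MY_RESTRICT __restrict
#else
#define MY_RESTRICT
#endif

// Promising that |src0|, |src1| and |dst| never overlap frees the compiler
// to vectorize and reorder the loads and stores in loops like the blend
// and filter kernels in this patch.
void AddRows(const int16_t* MY_RESTRICT src0, const int16_t* MY_RESTRICT src1,
             int16_t* MY_RESTRICT dst, int n) {
  for (int i = 0; i < n; ++i) dst[i] = src0[i] + src1[i];
}
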
diff --git a/src/dsp/x86/loop_restoration_10bit_sse4.cc b/src/dsp/x86/loop_restoration_10bit_sse4.cc
index 96380e3..6625d51 100644
--- a/src/dsp/x86/loop_restoration_10bit_sse4.cc
+++ b/src/dsp/x86/loop_restoration_10bit_sse4.cc
@@ -429,11 +429,14 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer,
}
void WienerFilter_SSE4_1(
- const RestorationUnitInfo& restoration_info, const void* const source,
- const ptrdiff_t stride, const void* const top_border,
- const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
const ptrdiff_t bottom_border_stride, const int width, const int height,
- RestorationBuffer* const restoration_buffer, void* const dest) {
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
const int16_t* const number_leading_zero_coefficients =
restoration_info.wiener_info.number_leading_zero_coefficients;
const int number_rows_to_skip = std::max(
@@ -2465,11 +2468,14 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
// at the end of each row. It is safe to overwrite the output as it will not be
// part of the visible frame.
void SelfGuidedFilter_SSE4_1(
- const RestorationUnitInfo& restoration_info, const void* const source,
- const ptrdiff_t stride, const void* const top_border,
- const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
const ptrdiff_t bottom_border_stride, const int width, const int height,
- RestorationBuffer* const restoration_buffer, void* const dest) {
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
const int index = restoration_info.sgr_proj_info.index;
const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
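
The number_rows_to_skip logic visible in these Wiener hunks follows from the filter's symmetric 7-tap shape: each leading zero coefficient trims one tap from both ends. A one-line sketch of the relationship (illustrative):

// 0 leading zeros -> 7 taps, 1 -> 5, 2 -> 3, 3 -> 1 (the center-only case).
int WienerTapCount(int number_leading_zero_coefficients) {
  return 7 - 2 * number_leading_zero_coefficients;
}
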
diff --git a/src/dsp/x86/loop_restoration_avx2.cc b/src/dsp/x86/loop_restoration_avx2.cc
index 351a324..30e8a22 100644
--- a/src/dsp/x86/loop_restoration_avx2.cc
+++ b/src/dsp/x86/loop_restoration_avx2.cc
@@ -483,11 +483,14 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer,
}
void WienerFilter_AVX2(
- const RestorationUnitInfo& restoration_info, const void* const source,
- const ptrdiff_t stride, const void* const top_border,
- const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
const ptrdiff_t bottom_border_stride, const int width, const int height,
- RestorationBuffer* const restoration_buffer, void* const dest) {
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
const int16_t* const number_leading_zero_coefficients =
restoration_info.wiener_info.number_leading_zero_coefficients;
const int number_rows_to_skip = std::max(
@@ -2880,11 +2883,14 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
// at the end of each row. It is safe to overwrite the output as it will not be
// part of the visible frame.
void SelfGuidedFilter_AVX2(
- const RestorationUnitInfo& restoration_info, const void* const source,
- const ptrdiff_t stride, const void* const top_border,
- const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
const ptrdiff_t bottom_border_stride, const int width, const int height,
- RestorationBuffer* const restoration_buffer, void* const dest) {
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
const int index = restoration_info.sgr_proj_info.index;
const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc
index 273bcc8..3363f0e 100644
--- a/src/dsp/x86/loop_restoration_sse4.cc
+++ b/src/dsp/x86/loop_restoration_sse4.cc
@@ -482,11 +482,14 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer,
}
void WienerFilter_SSE4_1(
- const RestorationUnitInfo& restoration_info, const void* const source,
- const ptrdiff_t stride, const void* const top_border,
- const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
const ptrdiff_t bottom_border_stride, const int width, const int height,
- RestorationBuffer* const restoration_buffer, void* const dest) {
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
const int16_t* const number_leading_zero_coefficients =
restoration_info.wiener_info.number_leading_zero_coefficients;
const int number_rows_to_skip = std::max(
@@ -2510,11 +2513,14 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
// in the end of each row. It is safe to overwrite the output as it will not be
// part of the visible frame.
void SelfGuidedFilter_SSE4_1(
- const RestorationUnitInfo& restoration_info, const void* const source,
- const ptrdiff_t stride, const void* const top_border,
- const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
const ptrdiff_t bottom_border_stride, const int width, const int height,
- RestorationBuffer* const restoration_buffer, void* const dest) {
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
const int index = restoration_info.sgr_proj_info.index;
const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
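In all three SelfGuidedFilter variants the body that follows reads one kSgrProjParams entry and picks the box-filter passes from its radii. A small sketch of that selection under the layout the comments above describe (entry[0] is the pass-0 radius, entry[2] the pass-1 radius; the names here are illustrative):

    #include <array>

    enum class SgrPasses { kBoth, kPass0Only, kPass1Only };

    // A radius of 0 disables the pass; pass 0 uses a 5x5 window (r = 2)
    // and pass 1 a 3x3 window (r = 1).
    inline SgrPasses SelectSgrPasses(const std::array<int, 4>& params) {
      const int radius_pass_0 = params[0];
      const int radius_pass_1 = params[2];
      if (radius_pass_0 != 0 && radius_pass_1 != 0) return SgrPasses::kBoth;
      return (radius_pass_0 != 0) ? SgrPasses::kPass0Only
                                  : SgrPasses::kPass1Only;
    }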
diff --git a/src/dsp/x86/mask_blend_sse4.cc b/src/dsp/x86/mask_blend_sse4.cc
index 2e836af..a18444b 100644
--- a/src/dsp/x86/mask_blend_sse4.cc
+++ b/src/dsp/x86/mask_blend_sse4.cc
@@ -36,7 +36,8 @@ namespace {
// Width can only be 4 when it is subsampled from a block of width 8, hence
// subsampling_x is always 1 when this function is called.
template <int subsampling_x, int subsampling_y>
-inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) {
+inline __m128i GetMask4x2(const uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
if (subsampling_x == 1) {
const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
const __m128i mask_val_1 =
@@ -62,7 +63,8 @@ inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) {
// 16-bit is also the lowest packing for hadd, but without subsampling there is
// an unfortunate conversion required.
template <int subsampling_x, int subsampling_y>
-inline __m128i GetMask8(const uint8_t* mask, ptrdiff_t stride) {
+inline __m128i GetMask8(const uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t stride) {
if (subsampling_x == 1) {
const __m128i row_vals = LoadUnaligned16(mask);
@@ -89,7 +91,8 @@ inline __m128i GetMask8(const uint8_t* mask, ptrdiff_t stride) {
// when is_inter_intra is true, the prediction values are brought to 8-bit
// packing as well.
template <int subsampling_x, int subsampling_y>
-inline __m128i GetInterIntraMask8(const uint8_t* mask, ptrdiff_t stride) {
+inline __m128i GetInterIntraMask8(const uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t stride) {
if (subsampling_x == 1) {
const __m128i row_vals = LoadUnaligned16(mask);
@@ -116,10 +119,11 @@ inline __m128i GetInterIntraMask8(const uint8_t* mask, ptrdiff_t stride) {
return mask_val;
}
-inline void WriteMaskBlendLine4x2(const int16_t* const pred_0,
- const int16_t* const pred_1,
+inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
+ const int16_t* LIBGAV1_RESTRICT const pred_1,
const __m128i pred_mask_0,
- const __m128i pred_mask_1, uint8_t* dst,
+ const __m128i pred_mask_1,
+ uint8_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t dst_stride) {
const __m128i pred_val_0 = LoadAligned16(pred_0);
const __m128i pred_val_1 = LoadAligned16(pred_1);
@@ -145,9 +149,11 @@ inline void WriteMaskBlendLine4x2(const int16_t* const pred_0,
}
template <int subsampling_x, int subsampling_y>
-inline void MaskBlending4x4_SSE4(const int16_t* pred_0, const int16_t* pred_1,
- const uint8_t* mask,
- const ptrdiff_t mask_stride, uint8_t* dst,
+inline void MaskBlending4x4_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1,
+ const uint8_t* LIBGAV1_RESTRICT mask,
+ const ptrdiff_t mask_stride,
+ uint8_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t dst_stride) {
const __m128i mask_inverter = _mm_set1_epi16(64);
__m128i pred_mask_0 =
@@ -167,10 +173,12 @@ inline void MaskBlending4x4_SSE4(const int16_t* pred_0, const int16_t* pred_1,
}
template <int subsampling_x, int subsampling_y>
-inline void MaskBlending4xH_SSE4(const int16_t* pred_0, const int16_t* pred_1,
- const uint8_t* const mask_ptr,
+inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
const ptrdiff_t mask_stride, const int height,
- uint8_t* dst, const ptrdiff_t dst_stride) {
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
const uint8_t* mask = mask_ptr;
if (height == 4) {
MaskBlending4x4_SSE4<subsampling_x, subsampling_y>(
@@ -222,11 +230,12 @@ inline void MaskBlending4xH_SSE4(const int16_t* pred_0, const int16_t* pred_1,
}
template <int subsampling_x, int subsampling_y>
-inline void MaskBlend_SSE4(const void* prediction_0, const void* prediction_1,
+inline void MaskBlend_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
const ptrdiff_t /*prediction_stride_1*/,
- const uint8_t* const mask_ptr,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
const ptrdiff_t mask_stride, const int width,
- const int height, void* dest,
+ const int height, void* LIBGAV1_RESTRICT dest,
const ptrdiff_t dst_stride) {
auto* dst = static_cast<uint8_t*>(dest);
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
@@ -277,11 +286,10 @@ inline void MaskBlend_SSE4(const void* prediction_0, const void* prediction_1,
} while (++y < height);
}
-inline void InterIntraWriteMaskBlendLine8bpp4x2(const uint8_t* const pred_0,
- uint8_t* const pred_1,
- const ptrdiff_t pred_stride_1,
- const __m128i pred_mask_0,
- const __m128i pred_mask_1) {
+inline void InterIntraWriteMaskBlendLine8bpp4x2(
+ const uint8_t* LIBGAV1_RESTRICT const pred_0,
+ uint8_t* LIBGAV1_RESTRICT const pred_1, const ptrdiff_t pred_stride_1,
+ const __m128i pred_mask_0, const __m128i pred_mask_1) {
const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
const __m128i pred_val_0 = LoadLo8(pred_0);
@@ -301,11 +309,10 @@ inline void InterIntraWriteMaskBlendLine8bpp4x2(const uint8_t* const pred_0,
}
template <int subsampling_x, int subsampling_y>
-inline void InterIntraMaskBlending8bpp4x4_SSE4(const uint8_t* pred_0,
- uint8_t* pred_1,
- const ptrdiff_t pred_stride_1,
- const uint8_t* mask,
- const ptrdiff_t mask_stride) {
+inline void InterIntraMaskBlending8bpp4x4_SSE4(
+ const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
+ const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask,
+ const ptrdiff_t mask_stride) {
const __m128i mask_inverter = _mm_set1_epi8(64);
const __m128i pred_mask_u16_first =
GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
@@ -328,12 +335,11 @@ inline void InterIntraMaskBlending8bpp4x4_SSE4(const uint8_t* pred_0,
}
template <int subsampling_x, int subsampling_y>
-inline void InterIntraMaskBlending8bpp4xH_SSE4(const uint8_t* pred_0,
- uint8_t* pred_1,
- const ptrdiff_t pred_stride_1,
- const uint8_t* const mask_ptr,
- const ptrdiff_t mask_stride,
- const int height) {
+inline void InterIntraMaskBlending8bpp4xH_SSE4(
+ const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+ const int height) {
const uint8_t* mask = mask_ptr;
if (height == 4) {
InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
@@ -358,12 +364,11 @@ inline void InterIntraMaskBlending8bpp4xH_SSE4(const uint8_t* pred_0,
}
template <int subsampling_x, int subsampling_y>
-void InterIntraMaskBlend8bpp_SSE4(const uint8_t* prediction_0,
- uint8_t* prediction_1,
- const ptrdiff_t prediction_stride_1,
- const uint8_t* const mask_ptr,
- const ptrdiff_t mask_stride, const int width,
- const int height) {
+void InterIntraMaskBlend8bpp_SSE4(
+ const uint8_t* LIBGAV1_RESTRICT prediction_0,
+ uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+ const int width, const int height) {
if (width == 4) {
InterIntraMaskBlending8bpp4xH_SSE4<subsampling_x, subsampling_y>(
prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
@@ -503,10 +508,11 @@ inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride,
}
inline void WriteMaskBlendLine10bpp4x2_SSE4_1(
- const uint16_t* pred_0, const uint16_t* pred_1,
- const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0,
- const __m128i& pred_mask_1, const __m128i& offset, const __m128i& max,
- const __m128i& shift4, uint16_t* dst, const ptrdiff_t dst_stride) {
+ const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1,
+ const __m128i& pred_mask_0, const __m128i& pred_mask_1,
+ const __m128i& offset, const __m128i& max, const __m128i& shift4,
+ uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
const __m128i pred_val_0 = LoadUnaligned16(pred_0);
const __m128i pred_val_1 = LoadHi8(LoadLo8(pred_1), pred_1 + pred_stride_1);
@@ -544,11 +550,12 @@ inline void WriteMaskBlendLine10bpp4x2_SSE4_1(
}
template <int subsampling_x, int subsampling_y>
-inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* pred_0,
- const uint16_t* pred_1,
+inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1,
const ptrdiff_t pred_stride_1,
- const uint8_t* mask,
- const ptrdiff_t mask_stride, uint16_t* dst,
+ const uint8_t* LIBGAV1_RESTRICT mask,
+ const ptrdiff_t mask_stride,
+ uint16_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t dst_stride) {
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
const __m128i zero = _mm_setzero_si128();
@@ -575,13 +582,12 @@ inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* pred_0,
}
template <int subsampling_x, int subsampling_y>
-inline void MaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0,
- const uint16_t* pred_1,
- const ptrdiff_t pred_stride_1,
- const uint8_t* const mask_ptr,
- const ptrdiff_t mask_stride,
- const int height, uint16_t* dst,
- const ptrdiff_t dst_stride) {
+inline void MaskBlend10bpp4xH_SSE4_1(
+ const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+ const int height, uint16_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
const uint8_t* mask = mask_ptr;
if (height == 4) {
MaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
@@ -648,13 +654,13 @@ inline void MaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0,
}
template <int subsampling_x, int subsampling_y>
-inline void MaskBlend10bpp_SSE4_1(const void* prediction_0,
- const void* prediction_1,
- const ptrdiff_t prediction_stride_1,
- const uint8_t* const mask_ptr,
- const ptrdiff_t mask_stride, const int width,
- const int height, void* dest,
- const ptrdiff_t dest_stride) {
+inline void MaskBlend10bpp_SSE4_1(
+ const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const ptrdiff_t prediction_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+ const int width, const int height, void* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint16_t*>(dest);
const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
@@ -725,10 +731,11 @@ inline void MaskBlend10bpp_SSE4_1(const void* prediction_0,
}
inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(
- const uint16_t* prediction_0, const uint16_t* prediction_1,
+ const uint16_t* LIBGAV1_RESTRICT prediction_0,
+ const uint16_t* LIBGAV1_RESTRICT prediction_1,
const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0,
- const __m128i& pred_mask_1, const __m128i& shift6, uint16_t* dst,
- const ptrdiff_t dst_stride) {
+ const __m128i& pred_mask_1, const __m128i& shift6,
+ uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
const __m128i pred_val_0 = LoadUnaligned16(prediction_0);
const __m128i pred_val_1 =
LoadHi8(LoadLo8(prediction_1), prediction_1 + pred_stride_1);
@@ -751,9 +758,10 @@ inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(
template <int subsampling_x, int subsampling_y>
inline void InterIntraMaskBlend10bpp4x4_SSE4_1(
- const uint16_t* pred_0, const uint16_t* pred_1,
- const ptrdiff_t pred_stride_1, const uint8_t* mask,
- const ptrdiff_t mask_stride, uint16_t* dst, const ptrdiff_t dst_stride) {
+ const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT mask, const ptrdiff_t mask_stride,
+ uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
const __m128i zero = _mm_setzero_si128();
@@ -777,13 +785,12 @@ inline void InterIntraMaskBlend10bpp4x4_SSE4_1(
}
template <int subsampling_x, int subsampling_y>
-inline void InterIntraMaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0,
- const uint16_t* pred_1,
- const ptrdiff_t pred_stride_1,
- const uint8_t* const mask_ptr,
- const ptrdiff_t mask_stride,
- const int height, uint16_t* dst,
- const ptrdiff_t dst_stride) {
+inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
+ const uint16_t* LIBGAV1_RESTRICT pred_0,
+ const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+ const int height, uint16_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
const uint8_t* mask = mask_ptr;
if (height == 4) {
InterIntraMaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
@@ -848,9 +855,11 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0,
template <int subsampling_x, int subsampling_y>
inline void InterIntraMaskBlend10bpp_SSE4_1(
- const void* prediction_0, const void* prediction_1,
- const ptrdiff_t prediction_stride_1, const uint8_t* const mask_ptr,
- const ptrdiff_t mask_stride, const int width, const int height, void* dest,
+ const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const ptrdiff_t prediction_stride_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
+ const int width, const int height, void* LIBGAV1_RESTRICT dest,
const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint16_t*>(dest);
const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
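All of the mask-blend kernels above vectorize the same per-pixel mix: a 6-bit mask in [0, 64] weights two compound predictions, with subsampling handled by averaging mask rows and columns first. A hedged scalar reference for the 8bpp case; the compound-offset removal and kInterPostRoundBit step of the real kernels are noted but omitted:

    #include <algorithm>
    #include <cstdint>

    inline uint8_t BlendPixel8bpp(int pred_0, int pred_1, int mask) {
      // |mask| is in [0, 64]; 64 selects pred_0 entirely.
      const int blended = (pred_0 * mask + pred_1 * (64 - mask) + 32) >> 6;
      // The real kernels also subtract the compound prediction offset and
      // shift by kInterPostRoundBit before packing to 8 bits.
      return static_cast<uint8_t>(std::min(std::max(blended, 0), 255));
    }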
diff --git a/src/dsp/x86/motion_field_projection_sse4.cc b/src/dsp/x86/motion_field_projection_sse4.cc
index e3f2cce..5641531 100644
--- a/src/dsp/x86/motion_field_projection_sse4.cc
+++ b/src/dsp/x86/motion_field_projection_sse4.cc
@@ -360,27 +360,12 @@ void MotionFieldProjectionKernel_SSE4_1(
} while (++y8 < y8_end);
}
-void Init8bpp() {
- Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
- assert(dsp != nullptr);
- dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1;
-}
-
-#if LIBGAV1_MAX_BITDEPTH >= 10
-void Init10bpp() {
- Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
- assert(dsp != nullptr);
- dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1;
-}
-#endif
-
} // namespace
void MotionFieldProjectionInit_SSE4_1() {
- Init8bpp();
-#if LIBGAV1_MAX_BITDEPTH >= 10
- Init10bpp();
-#endif
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1;
}
} // namespace dsp
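Motion field projection operates on motion vectors rather than pixels, so the kernel never depended on bitdepth; this release drops the duplicated Init10bpp and registers the function once (the shared entry is presumably propagated to the 10bpp table elsewhere in dsp.cc). The init pattern itself, sketched with illustrative stand-in types rather than libgav1's real ones:

    #include <cassert>

    struct ToyDsp {
      int (*kernel)(int);
    };
    inline int Kernel_C(int x) { return x + 1; }
    inline int Kernel_SSE4(int x) { return x + 1; }  // same result, faster path

    inline ToyDsp* GetWritableToyTable() {
      static ToyDsp table = {Kernel_C};
      return &table;
    }

    inline void ToyInit_SSE4() {
      ToyDsp* const dsp = GetWritableToyTable();
      assert(dsp != nullptr);
      dsp->kernel = Kernel_SSE4;  // overwrite the C default in place
    }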
diff --git a/src/dsp/x86/motion_vector_search_sse4.cc b/src/dsp/x86/motion_vector_search_sse4.cc
index 7f5f035..dacc6ec 100644
--- a/src/dsp/x86/motion_vector_search_sse4.cc
+++ b/src/dsp/x86/motion_vector_search_sse4.cc
@@ -64,7 +64,7 @@ inline __m128i MvProjectionClip(const __m128i mvs[2],
}
inline __m128i MvProjectionCompoundClip(
- const MotionVector* const temporal_mvs,
+ const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
const int8_t temporal_reference_offsets[2],
const int reference_offsets[2]) {
const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
@@ -83,8 +83,8 @@ inline __m128i MvProjectionCompoundClip(
}
inline __m128i MvProjectionSingleClip(
- const MotionVector* const temporal_mvs,
- const int8_t* const temporal_reference_offsets,
+ const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
const int reference_offset) {
const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
const __m128i temporal_mv = LoadAligned16(tmvs);
@@ -126,9 +126,10 @@ inline void ForceInteger(const __m128i mv, void* const candidate_mvs) {
}
void MvProjectionCompoundLowPrecision_SSE4_1(
- const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
const int reference_offsets[2], const int count,
- CompoundMotionVector* candidate_mvs) {
+ CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
// |reference_offsets| non-zero check usually equals true and is ignored.
// To facilitate the compilers, make a local copy of |reference_offsets|.
const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
@@ -143,9 +144,10 @@ void MvProjectionCompoundLowPrecision_SSE4_1(
}
void MvProjectionCompoundForceInteger_SSE4_1(
- const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
const int reference_offsets[2], const int count,
- CompoundMotionVector* candidate_mvs) {
+ CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
// |reference_offsets| non-zero check usually equals true and is ignored.
// To facilitate the compilers, make a local copy of |reference_offsets|.
const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
@@ -160,9 +162,10 @@ void MvProjectionCompoundForceInteger_SSE4_1(
}
void MvProjectionCompoundHighPrecision_SSE4_1(
- const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+ const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
const int reference_offsets[2], const int count,
- CompoundMotionVector* candidate_mvs) {
+ CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
// |reference_offsets| non-zero check usually equals true and is ignored.
// To facilitate the compilers, make a local copy of |reference_offsets|.
const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
@@ -177,8 +180,10 @@ void MvProjectionCompoundHighPrecision_SSE4_1(
}
void MvProjectionSingleLowPrecision_SSE4_1(
- const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
- const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+ const int reference_offset, const int count,
+ MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
// Up to three more elements could be calculated.
int i = 0;
do {
@@ -190,8 +195,10 @@ void MvProjectionSingleLowPrecision_SSE4_1(
}
void MvProjectionSingleForceInteger_SSE4_1(
- const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
- const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+ const int reference_offset, const int count,
+ MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
// Up to three more elements could be calculated.
int i = 0;
do {
@@ -203,8 +210,10 @@ void MvProjectionSingleForceInteger_SSE4_1(
}
void MvProjectionSingleHighPrecision_SSE4_1(
- const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
- const int reference_offset, const int count, MotionVector* candidate_mvs) {
+ const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+ const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+ const int reference_offset, const int count,
+ MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
// Up to three more elements could be calculated.
int i = 0;
do {
@@ -215,20 +224,10 @@ void MvProjectionSingleHighPrecision_SSE4_1(
} while (i < count);
}
-void Init8bpp() {
- Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
- assert(dsp != nullptr);
- dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1;
- dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1;
- dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1;
- dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1;
- dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1;
- dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1;
-}
+} // namespace
-#if LIBGAV1_MAX_BITDEPTH >= 10
-void Init10bpp() {
- Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+void MotionVectorSearchInit_SSE4_1() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1;
dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1;
@@ -237,16 +236,6 @@ void Init10bpp() {
dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1;
dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1;
}
-#endif
-
-} // namespace
-
-void MotionVectorSearchInit_SSE4_1() {
- Init8bpp();
-#if LIBGAV1_MAX_BITDEPTH >= 10
- Init10bpp();
-#endif
-}
} // namespace dsp
} // namespace libgav1
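The same consolidation applies here: MV projection scales a stored vector by the ratio of reference-frame distances and clamps it, independent of pixel bitdepth. A rough scalar sketch of one component; libgav1 uses fixed-point division tables rather than a runtime divide, and the clamp bound mirrors kProjectionMvClamp but is an assumption here:

    #include <algorithm>
    #include <cstdint>

    inline int16_t ProjectMvComponent(int mv, int numerator, int denominator) {
      const int scaled = mv * numerator;
      const int rounded = (scaled >= 0)
                              ? (scaled + denominator / 2) / denominator
                              : -((-scaled + denominator / 2) / denominator);
      // Clamp to roughly +/-(1 << 14), per the projection clamp constant.
      return static_cast<int16_t>(
          std::min(std::max(rounded, -(1 << 14) + 1), (1 << 14) - 1));
    }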
diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc
index c34a7f7..8ce23b4 100644
--- a/src/dsp/x86/obmc_sse4.cc
+++ b/src/dsp/x86/obmc_sse4.cc
@@ -37,8 +37,9 @@ namespace {
#include "src/dsp/obmc.inc"
inline void OverlapBlendFromLeft2xH_SSE4_1(
- uint8_t* const prediction, const ptrdiff_t prediction_stride,
- const int height, const uint8_t* const obmc_prediction,
+ uint8_t* LIBGAV1_RESTRICT const prediction,
+ const ptrdiff_t prediction_stride, const int height,
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
const ptrdiff_t obmc_prediction_stride) {
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
@@ -68,8 +69,9 @@ inline void OverlapBlendFromLeft2xH_SSE4_1(
}
inline void OverlapBlendFromLeft4xH_SSE4_1(
- uint8_t* const prediction, const ptrdiff_t prediction_stride,
- const int height, const uint8_t* const obmc_prediction,
+ uint8_t* LIBGAV1_RESTRICT const prediction,
+ const ptrdiff_t prediction_stride, const int height,
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
const ptrdiff_t obmc_prediction_stride) {
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
@@ -106,8 +108,9 @@ inline void OverlapBlendFromLeft4xH_SSE4_1(
}
inline void OverlapBlendFromLeft8xH_SSE4_1(
- uint8_t* const prediction, const ptrdiff_t prediction_stride,
- const int height, const uint8_t* const obmc_prediction,
+ uint8_t* LIBGAV1_RESTRICT const prediction,
+ const ptrdiff_t prediction_stride, const int height,
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
const ptrdiff_t obmc_prediction_stride) {
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
@@ -130,13 +133,15 @@ inline void OverlapBlendFromLeft8xH_SSE4_1(
} while (--y != 0);
}
-void OverlapBlendFromLeft_SSE4_1(void* const prediction,
- const ptrdiff_t prediction_stride,
- const int width, const int height,
- const void* const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+void OverlapBlendFromLeft_SSE4_1(
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* LIBGAV1_RESTRICT const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
auto* pred = static_cast<uint8_t*>(prediction);
const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+ assert(width >= 2);
+ assert(height >= 4);
if (width == 2) {
OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
@@ -185,8 +190,9 @@ void OverlapBlendFromLeft_SSE4_1(void* const prediction,
}
inline void OverlapBlendFromTop4xH_SSE4_1(
- uint8_t* const prediction, const ptrdiff_t prediction_stride,
- const int height, const uint8_t* const obmc_prediction,
+ uint8_t* LIBGAV1_RESTRICT const prediction,
+ const ptrdiff_t prediction_stride, const int height,
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
const ptrdiff_t obmc_prediction_stride) {
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
@@ -227,8 +233,9 @@ inline void OverlapBlendFromTop4xH_SSE4_1(
}
inline void OverlapBlendFromTop8xH_SSE4_1(
- uint8_t* const prediction, const ptrdiff_t prediction_stride,
- const int height, const uint8_t* const obmc_prediction,
+ uint8_t* LIBGAV1_RESTRICT const prediction,
+ const ptrdiff_t prediction_stride, const int height,
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
const ptrdiff_t obmc_prediction_stride) {
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
@@ -253,15 +260,17 @@ inline void OverlapBlendFromTop8xH_SSE4_1(
} while (--y != 0);
}
-void OverlapBlendFromTop_SSE4_1(void* const prediction,
- const ptrdiff_t prediction_stride,
- const int width, const int height,
- const void* const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+void OverlapBlendFromTop_SSE4_1(
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* LIBGAV1_RESTRICT const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
auto* pred = static_cast<uint8_t*>(prediction);
const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+ assert(width >= 4);
+ assert(height >= 2);
- if (width <= 4) {
+ if (width == 4) {
OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
obmc_prediction_stride);
return;
@@ -323,8 +332,9 @@ namespace {
constexpr int kRoundBitsObmcBlend = 6;
inline void OverlapBlendFromLeft2xH_SSE4_1(
- uint16_t* const prediction, const ptrdiff_t pred_stride, const int height,
- const uint16_t* const obmc_prediction, const ptrdiff_t obmc_pred_stride) {
+ uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
+ const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction,
+ const ptrdiff_t obmc_pred_stride) {
uint16_t* pred = prediction;
const uint16_t* obmc_pred = obmc_prediction;
const ptrdiff_t pred_stride2 = pred_stride << 1;
@@ -353,8 +363,9 @@ inline void OverlapBlendFromLeft2xH_SSE4_1(
}
inline void OverlapBlendFromLeft4xH_SSE4_1(
- uint16_t* const prediction, const ptrdiff_t pred_stride, const int height,
- const uint16_t* const obmc_prediction, const ptrdiff_t obmc_pred_stride) {
+ uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
+ const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction,
+ const ptrdiff_t obmc_pred_stride) {
uint16_t* pred = prediction;
const uint16_t* obmc_pred = obmc_prediction;
const ptrdiff_t pred_stride2 = pred_stride << 1;
@@ -385,16 +396,18 @@ inline void OverlapBlendFromLeft4xH_SSE4_1(
} while (y != 0);
}
-void OverlapBlendFromLeft10bpp_SSE4_1(void* const prediction,
- const ptrdiff_t prediction_stride,
- const int width, const int height,
- const void* const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+void OverlapBlendFromLeft10bpp_SSE4_1(
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* LIBGAV1_RESTRICT const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
auto* pred = static_cast<uint16_t*>(prediction);
const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]);
const ptrdiff_t obmc_pred_stride =
obmc_prediction_stride / sizeof(obmc_pred[0]);
+ assert(width >= 2);
+ assert(height >= 4);
if (width == 2) {
OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred,
@@ -437,54 +450,10 @@ void OverlapBlendFromLeft10bpp_SSE4_1(void* const prediction,
} while (x < width);
}
-inline void OverlapBlendFromTop2xH_SSE4_1(uint16_t* const prediction,
- const ptrdiff_t pred_stride,
- const int height,
- const uint16_t* const obmc_prediction,
- const ptrdiff_t obmc_pred_stride) {
- uint16_t* pred = prediction;
- const uint16_t* obmc_pred = obmc_prediction;
- const __m128i mask_inverter = _mm_set1_epi16(64);
- const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
- const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
- const uint8_t* mask = kObmcMask + height - 2;
- const int compute_height =
- height - (height >> 2); // compute_height based on 8-bit opt
- const ptrdiff_t pred_stride2 = pred_stride << 1;
- const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
- int y = 0;
- do {
- // First mask in the first half, second mask in the second half.
- const __m128i mask_val = _mm_shuffle_epi8(Load4(mask + y), mask_shuffler);
- const __m128i masks =
- _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
- const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
- const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
-
- const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
- const __m128i obmc_pred_val =
- LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
- const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val);
- const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val);
- const __m128i result_lo = RightShiftWithRounding_U32(
- _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
- const __m128i result_hi = RightShiftWithRounding_U32(
- _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
- const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi);
-
- Store4(pred, packed_result);
- Store4(pred + pred_stride, _mm_srli_si128(packed_result, 8));
- pred += pred_stride2;
- obmc_pred += obmc_pred_stride2;
- y += 2;
- } while (y < compute_height);
-}
-
-inline void OverlapBlendFromTop4xH_SSE4_1(uint16_t* const prediction,
- const ptrdiff_t pred_stride,
- const int height,
- const uint16_t* const obmc_prediction,
- const ptrdiff_t obmc_pred_stride) {
+inline void OverlapBlendFromTop4xH_SSE4_1(
+ uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
+ const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction,
+ const ptrdiff_t obmc_pred_stride) {
uint16_t* pred = prediction;
const uint16_t* obmc_pred = obmc_prediction;
const __m128i mask_inverter = _mm_set1_epi16(64);
@@ -522,22 +491,19 @@ inline void OverlapBlendFromTop4xH_SSE4_1(uint16_t* const prediction,
} while (y < compute_height);
}
-void OverlapBlendFromTop10bpp_SSE4_1(void* const prediction,
- const ptrdiff_t prediction_stride,
- const int width, const int height,
- const void* const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+void OverlapBlendFromTop10bpp_SSE4_1(
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* LIBGAV1_RESTRICT const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
auto* pred = static_cast<uint16_t*>(prediction);
const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]);
const ptrdiff_t obmc_pred_stride =
obmc_prediction_stride / sizeof(obmc_pred[0]);
+ assert(width >= 4);
+ assert(height >= 2);
- if (width == 2) {
- OverlapBlendFromTop2xH_SSE4_1(pred, pred_stride, height, obmc_pred,
- obmc_pred_stride);
- return;
- }
if (width == 4) {
OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred,
obmc_pred_stride);
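The new asserts document why the 10bpp OverlapBlendFromTop2xH path could be deleted outright: from-top OBMC blocks are at least 4 wide (and from-left blocks at least 2), so the 2xH top case was unreachable. The blend itself, in a hedged scalar form where the kObmcMask weight favors the current prediction away from the shared edge:

    #include <cstdint>

    inline uint16_t ObmcBlendPixel(uint16_t pred, uint16_t obmc_pred,
                                   uint8_t mask) {
      // |mask| is in [0, 64] and grows toward 64 away from the boundary,
      // so the neighbor's prediction fades out over the blended rows.
      return static_cast<uint16_t>(
          (mask * pred + (64 - mask) * obmc_pred + 32) >> 6);
    }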
diff --git a/src/dsp/x86/super_res_sse4.cc b/src/dsp/x86/super_res_sse4.cc
index 85d05bc..458d94e 100644
--- a/src/dsp/x86/super_res_sse4.cc
+++ b/src/dsp/x86/super_res_sse4.cc
@@ -90,11 +90,13 @@ void SuperResCoefficients_SSE4_1(const int upscaled_width,
} while (--x != 0);
}
-void SuperRes_SSE4_1(const void* const coefficients, void* const source,
+void SuperRes_SSE4_1(const void* LIBGAV1_RESTRICT const coefficients,
+ void* LIBGAV1_RESTRICT const source,
const ptrdiff_t source_stride, const int height,
const int downscaled_width, const int upscaled_width,
const int initial_subpixel_x, const int step,
- void* const dest, const ptrdiff_t dest_stride) {
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
auto* dst = static_cast<uint8_t*>(dest);
int y = height;
@@ -227,11 +229,13 @@ void SuperResCoefficients_SSE4_1(const int upscaled_width,
}
template <int bitdepth>
-void SuperRes_SSE4_1(const void* const coefficients, void* const source,
+void SuperRes_SSE4_1(const void* LIBGAV1_RESTRICT const coefficients,
+ void* LIBGAV1_RESTRICT const source,
const ptrdiff_t source_stride, const int height,
const int downscaled_width, const int upscaled_width,
const int initial_subpixel_x, const int step,
- void* const dest, const ptrdiff_t dest_stride) {
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
auto* dst = static_cast<uint16_t*>(dest);
int y = height;
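Both SuperRes_SSE4_1 overloads implement the same horizontal upscaler: the source position advances by |step| in fixed-point units from |initial_subpixel_x|, and an 8-tap filter chosen by the subpixel phase is applied around each integer position. A scalar sketch under stated assumptions (14 fractional bits, 64 filter phases, 7-bit coefficients, and a |src| row padded by the filter's half-width):

    #include <algorithm>
    #include <cstdint>

    inline void SuperResRow8bpp(const uint8_t* src, const int16_t filters[][8],
                                int initial_subpixel_x, int step,
                                int upscaled_width, uint8_t* dst) {
      constexpr int kSubpixelBits = 14;  // assumed fixed-point precision
      int subpixel_x = initial_subpixel_x;
      for (int x = 0; x < upscaled_width; ++x) {
        const int ix = subpixel_x >> kSubpixelBits;
        // Keep the top 6 bits of the fraction as the filter phase.
        const int phase =
            (subpixel_x & ((1 << kSubpixelBits) - 1)) >> (kSubpixelBits - 6);
        int sum = 0;
        for (int k = 0; k < 8; ++k) {
          sum += src[ix - 3 + k] * filters[phase][k];  // 8 taps around ix
        }
        dst[x] = static_cast<uint8_t>(
            std::min(std::max((sum + (1 << 6)) >> 7, 0), 255));
        subpixel_x += step;
      }
    }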
diff --git a/src/dsp/x86/warp_sse4.cc b/src/dsp/x86/warp_sse4.cc
index 9ddfeac..5830894 100644
--- a/src/dsp/x86/warp_sse4.cc
+++ b/src/dsp/x86/warp_sse4.cc
@@ -101,7 +101,7 @@ inline void HorizontalFilter(const int sx4, const int16_t alpha,
template <bool is_compound>
inline void WriteVerticalFilter(const __m128i filter[8],
const int16_t intermediate_result[15][8], int y,
- void* dst_row) {
+ void* LIBGAV1_RESTRICT dst_row) {
constexpr int kRoundBitsVertical =
is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
__m128i sum_low = _mm_set1_epi32(kOffsetRemoval);
@@ -136,8 +136,9 @@ inline void WriteVerticalFilter(const __m128i filter[8],
template <bool is_compound>
inline void WriteVerticalFilter(const __m128i filter[8],
- const int16_t* intermediate_result_column,
- void* dst_row) {
+ const int16_t* LIBGAV1_RESTRICT
+ intermediate_result_column,
+ void* LIBGAV1_RESTRICT dst_row) {
constexpr int kRoundBitsVertical =
is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
__m128i sum_low = _mm_setzero_si128();
@@ -167,7 +168,7 @@ inline void WriteVerticalFilter(const __m128i filter[8],
template <bool is_compound, typename DestType>
inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma,
- int delta, DestType* dest_row,
+ int delta, DestType* LIBGAV1_RESTRICT dest_row,
ptrdiff_t dest_stride) {
int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
for (int y = 0; y < 8; ++y) {
@@ -187,8 +188,9 @@ inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma,
}
template <bool is_compound, typename DestType>
-inline void VerticalFilter(const int16_t* source_cols, int y4, int gamma,
- int delta, DestType* dest_row,
+inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols, int y4,
+ int gamma, int delta,
+ DestType* LIBGAV1_RESTRICT dest_row,
ptrdiff_t dest_stride) {
int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
for (int y = 0; y < 8; ++y) {
@@ -208,9 +210,11 @@ inline void VerticalFilter(const int16_t* source_cols, int y4, int gamma,
}
template <bool is_compound, typename DestType>
-inline void WarpRegion1(const uint8_t* src, ptrdiff_t source_stride,
- int source_width, int source_height, int ix4, int iy4,
- DestType* dst_row, ptrdiff_t dest_stride) {
+inline void WarpRegion1(const uint8_t* LIBGAV1_RESTRICT src,
+ ptrdiff_t source_stride, int source_width,
+ int source_height, int ix4, int iy4,
+ DestType* LIBGAV1_RESTRICT dst_row,
+ ptrdiff_t dest_stride) {
// Region 1
// Points to the left or right border of the first row of |src|.
const uint8_t* first_row_border =
@@ -244,10 +248,12 @@ inline void WarpRegion1(const uint8_t* src, ptrdiff_t source_stride,
}
template <bool is_compound, typename DestType>
-inline void WarpRegion2(const uint8_t* src, ptrdiff_t source_stride,
- int source_width, int y4, int ix4, int iy4, int gamma,
- int delta, int16_t intermediate_result_column[15],
- DestType* dst_row, ptrdiff_t dest_stride) {
+inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src,
+ ptrdiff_t source_stride, int source_width, int y4,
+ int ix4, int iy4, int gamma, int delta,
+ int16_t intermediate_result_column[15],
+ DestType* LIBGAV1_RESTRICT dst_row,
+ ptrdiff_t dest_stride) {
// Region 2.
// Points to the left or right border of the first row of |src|.
const uint8_t* first_row_border =
@@ -283,9 +289,10 @@ inline void WarpRegion2(const uint8_t* src, ptrdiff_t source_stride,
}
template <bool is_compound, typename DestType>
-inline void WarpRegion3(const uint8_t* src, ptrdiff_t source_stride,
- int source_height, int alpha, int beta, int x4, int ix4,
- int iy4, int16_t intermediate_result[15][8]) {
+inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src,
+ ptrdiff_t source_stride, int source_height, int alpha,
+ int beta, int x4, int ix4, int iy4,
+ int16_t intermediate_result[15][8]) {
// Region 3
// At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
@@ -315,9 +322,9 @@ inline void WarpRegion3(const uint8_t* src, ptrdiff_t source_stride,
}
template <bool is_compound, typename DestType>
-inline void WarpRegion4(const uint8_t* src, ptrdiff_t source_stride, int alpha,
- int beta, int x4, int ix4, int iy4,
- int16_t intermediate_result[15][8]) {
+inline void WarpRegion4(const uint8_t* LIBGAV1_RESTRICT src,
+ ptrdiff_t source_stride, int alpha, int beta, int x4,
+ int ix4, int iy4, int16_t intermediate_result[15][8]) {
// Region 4.
// At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
@@ -351,12 +358,14 @@ inline void WarpRegion4(const uint8_t* src, ptrdiff_t source_stride, int alpha,
}
template <bool is_compound, typename DestType>
-inline void HandleWarpBlock(const uint8_t* src, ptrdiff_t source_stride,
- int source_width, int source_height,
- const int* warp_params, int subsampling_x,
- int subsampling_y, int src_x, int src_y,
- int16_t alpha, int16_t beta, int16_t gamma,
- int16_t delta, DestType* dst_row,
+inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src,
+ ptrdiff_t source_stride, int source_width,
+ int source_height,
+ const int* LIBGAV1_RESTRICT warp_params,
+ int subsampling_x, int subsampling_y, int src_x,
+ int src_y, int16_t alpha, int16_t beta,
+ int16_t gamma, int16_t delta,
+ DestType* LIBGAV1_RESTRICT dst_row,
ptrdiff_t dest_stride) {
union {
// Intermediate_result is the output of the horizontal filtering and
@@ -460,11 +469,12 @@ inline void HandleWarpBlock(const uint8_t* src, ptrdiff_t source_stride,
}
template <bool is_compound>
-void Warp_SSE4_1(const void* source, ptrdiff_t source_stride, int source_width,
- int source_height, const int* warp_params, int subsampling_x,
+void Warp_SSE4_1(const void* LIBGAV1_RESTRICT source, ptrdiff_t source_stride,
+ int source_width, int source_height,
+ const int* LIBGAV1_RESTRICT warp_params, int subsampling_x,
int subsampling_y, int block_start_x, int block_start_y,
int block_width, int block_height, int16_t alpha, int16_t beta,
- int16_t gamma, int16_t delta, void* dest,
+ int16_t gamma, int16_t delta, void* LIBGAV1_RESTRICT dest,
ptrdiff_t dest_stride) {
const auto* const src = static_cast<const uint8_t*>(source);
using DestType =
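HandleWarpBlock dispatches each 8x8 block to one of four regions depending on whether the sampled rows and columns fall inside the source frame, which avoids per-pixel border clamping in the common all-inside case. The affine mapping those regions start from, sketched with AV1's usual parameter layout (translation in warp_params[0..1], the 2x2 matrix in warp_params[2..5], 16 bits of precision; treat the layout as an assumption):

    #include <cstdint>

    inline void WarpSourcePosition(const int warp_params[6], int x, int y,
                                   int* ix4, int* iy4) {
      constexpr int kPrecisionBits = 16;  // kWarpedModelPrecisionBits
      const int64_t dst_x = static_cast<int64_t>(warp_params[2]) * x +
                            static_cast<int64_t>(warp_params[3]) * y +
                            warp_params[0];
      const int64_t dst_y = static_cast<int64_t>(warp_params[4]) * x +
                            static_cast<int64_t>(warp_params[5]) * y +
                            warp_params[1];
      *ix4 = static_cast<int>(dst_x >> kPrecisionBits);
      *iy4 = static_cast<int>(dst_y >> kPrecisionBits);
    }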
diff --git a/src/dsp/x86/weight_mask_sse4.cc b/src/dsp/x86/weight_mask_sse4.cc
index 08a1739..69cb784 100644
--- a/src/dsp/x86/weight_mask_sse4.cc
+++ b/src/dsp/x86/weight_mask_sse4.cc
@@ -37,8 +37,9 @@ namespace {
constexpr int kRoundingBits8bpp = 4;
template <bool mask_is_inverse, bool is_store_16>
-inline void WeightMask16_SSE4(const int16_t* prediction_0,
- const int16_t* prediction_1, uint8_t* mask,
+inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
ptrdiff_t mask_stride) {
const __m128i pred_00 = LoadAligned16(prediction_0);
const __m128i pred_10 = LoadAligned16(prediction_1);
@@ -86,8 +87,9 @@ inline void WeightMask16_SSE4(const int16_t* prediction_0,
mask += mask_stride << 1
template <bool mask_is_inverse>
-void WeightMask8x8_SSE4(const void* prediction_0, const void* prediction_1,
- uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask8x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
@@ -98,8 +100,10 @@ void WeightMask8x8_SSE4(const void* prediction_0, const void* prediction_1,
}
template <bool mask_is_inverse>
-void WeightMask8x16_SSE4(const void* prediction_0, const void* prediction_1,
- uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask8x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 3;
@@ -112,8 +116,10 @@ void WeightMask8x16_SSE4(const void* prediction_0, const void* prediction_1,
}
template <bool mask_is_inverse>
-void WeightMask8x32_SSE4(const void* prediction_0, const void* prediction_1,
- uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y5 = 5;
@@ -135,8 +141,10 @@ void WeightMask8x32_SSE4(const void* prediction_0, const void* prediction_1,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask16x8_SSE4(const void* prediction_0, const void* prediction_1,
- uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask16x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y = 7;
@@ -147,8 +155,10 @@ void WeightMask16x8_SSE4(const void* prediction_0, const void* prediction_1,
}
template <bool mask_is_inverse>
-void WeightMask16x16_SSE4(const void* prediction_0, const void* prediction_1,
- uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask16x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 5;
@@ -161,8 +171,10 @@ void WeightMask16x16_SSE4(const void* prediction_0, const void* prediction_1,
}
template <bool mask_is_inverse>
-void WeightMask16x32_SSE4(const void* prediction_0, const void* prediction_1,
- uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask16x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y5 = 6;
@@ -178,8 +190,10 @@ void WeightMask16x32_SSE4(const void* prediction_0, const void* prediction_1,
}
template <bool mask_is_inverse>
-void WeightMask16x64_SSE4(const void* prediction_0, const void* prediction_1,
- uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 21;
@@ -203,8 +217,10 @@ void WeightMask16x64_SSE4(const void* prediction_0, const void* prediction_1,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask32x8_SSE4(const void* prediction_0, const void* prediction_1,
- uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask32x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
WEIGHT32_AND_STRIDE;
@@ -218,8 +234,10 @@ void WeightMask32x8_SSE4(const void* prediction_0, const void* prediction_1,
}
template <bool mask_is_inverse>
-void WeightMask32x16_SSE4(const void* prediction_0, const void* prediction_1,
- uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask32x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 5;
@@ -232,8 +250,10 @@ void WeightMask32x16_SSE4(const void* prediction_0, const void* prediction_1,
}
template <bool mask_is_inverse>
-void WeightMask32x32_SSE4(const void* prediction_0, const void* prediction_1,
- uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask32x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y5 = 6;
@@ -249,8 +269,10 @@ void WeightMask32x32_SSE4(const void* prediction_0, const void* prediction_1,
}
template <bool mask_is_inverse>
-void WeightMask32x64_SSE4(const void* prediction_0, const void* prediction_1,
- uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 21;
@@ -278,8 +300,10 @@ void WeightMask32x64_SSE4(const void* prediction_0, const void* prediction_1,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask64x16_SSE4(const void* prediction_0, const void* prediction_1,
- uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask64x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -292,8 +316,10 @@ void WeightMask64x16_SSE4(const void* prediction_0, const void* prediction_1,
}
template <bool mask_is_inverse>
-void WeightMask64x32_SSE4(const void* prediction_0, const void* prediction_1,
- uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask64x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y5 = 0;
@@ -309,8 +335,10 @@ void WeightMask64x32_SSE4(const void* prediction_0, const void* prediction_1,
}
template <bool mask_is_inverse>
-void WeightMask64x64_SSE4(const void* prediction_0, const void* prediction_1,
- uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask64x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -323,8 +351,10 @@ void WeightMask64x64_SSE4(const void* prediction_0, const void* prediction_1,
}
template <bool mask_is_inverse>
-void WeightMask64x128_SSE4(const void* prediction_0, const void* prediction_1,
- uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask64x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -338,8 +368,10 @@ void WeightMask64x128_SSE4(const void* prediction_0, const void* prediction_1,
}
template <bool mask_is_inverse>
-void WeightMask128x64_SSE4(const void* prediction_0, const void* prediction_1,
- uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask128x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -380,8 +412,10 @@ void WeightMask128x64_SSE4(const void* prediction_0, const void* prediction_1,
}
template <bool mask_is_inverse>
-void WeightMask128x128_SSE4(const void* prediction_0, const void* prediction_1,
- uint8_t* mask, ptrdiff_t mask_stride) {
+void WeightMask128x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
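The 8bpp and 10bpp weight-mask kernels share one per-pixel rule, visible in the constants above (diff_offset = 38, mask_ceiling = 64): the rounded absolute difference of the two compound predictions is scaled down and offset into a 6-bit weight. A hedged scalar reference; the exact rounding of libgav1's C version may differ slightly:

    #include <algorithm>
    #include <cstdint>
    #include <cstdlib>

    inline uint8_t WeightMaskPixel8bpp(int16_t pred_0, int16_t pred_1) {
      constexpr int kRoundingBits = 4;  // kRoundingBits8bpp
      const int difference =
          (std::abs(pred_0 - pred_1) + (1 << (kRoundingBits - 1))) >>
          kRoundingBits;
      return static_cast<uint8_t>(std::min(38 + (difference >> 4), 64));
    }

    // mask_is_inverse selects 64 - mask instead, and is_store_16 controls
    // whether two 8-wide results are written as one 16-wide mask row.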
@@ -467,9 +501,10 @@ constexpr int kRoundingBits10bpp = 6;
constexpr int kScaledDiffShift = 4;
template <bool mask_is_inverse, bool is_store_16>
-inline void WeightMask16_10bpp_SSE4(const uint16_t* prediction_0,
- const uint16_t* prediction_1, uint8_t* mask,
- ptrdiff_t mask_stride) {
+inline void WeightMask16_10bpp_SSE4(
+ const uint16_t* LIBGAV1_RESTRICT prediction_0,
+ const uint16_t* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
const __m128i diff_offset = _mm_set1_epi8(38);
const __m128i mask_ceiling = _mm_set1_epi8(64);
const __m128i zero = _mm_setzero_si128();
@@ -538,8 +573,9 @@ inline void WeightMask16_10bpp_SSE4(const uint16_t* prediction_0,
mask += mask_stride << 1
template <bool mask_is_inverse>
-void WeightMask8x8_10bpp_SSE4(const void* prediction_0,
- const void* prediction_1, uint8_t* mask,
+void WeightMask8x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -551,8 +587,9 @@ void WeightMask8x8_10bpp_SSE4(const void* prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask8x16_10bpp_SSE4(const void* prediction_0,
- const void* prediction_1, uint8_t* mask,
+void WeightMask8x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -566,8 +603,9 @@ void WeightMask8x16_10bpp_SSE4(const void* prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask8x32_10bpp_SSE4(const void* prediction_0,
- const void* prediction_1, uint8_t* mask,
+void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -591,8 +629,9 @@ void WeightMask8x32_10bpp_SSE4(const void* prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask16x8_10bpp_SSE4(const void* prediction_0,
- const void* prediction_1, uint8_t* mask,
+void WeightMask16x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -604,8 +643,9 @@ void WeightMask16x8_10bpp_SSE4(const void* prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x16_10bpp_SSE4(const void* prediction_0,
- const void* prediction_1, uint8_t* mask,
+void WeightMask16x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -619,8 +659,9 @@ void WeightMask16x16_10bpp_SSE4(const void* prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x32_10bpp_SSE4(const void* prediction_0,
- const void* prediction_1, uint8_t* mask,
+void WeightMask16x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -637,8 +678,9 @@ void WeightMask16x32_10bpp_SSE4(const void* prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x64_10bpp_SSE4(const void* prediction_0,
- const void* prediction_1, uint8_t* mask,
+void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -664,8 +706,9 @@ void WeightMask16x64_10bpp_SSE4(const void* prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask32x8_10bpp_SSE4(const void* prediction_0,
- const void* prediction_1, uint8_t* mask,
+void WeightMask32x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -680,8 +723,9 @@ void WeightMask32x8_10bpp_SSE4(const void* prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x16_10bpp_SSE4(const void* prediction_0,
- const void* prediction_1, uint8_t* mask,
+void WeightMask32x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -695,8 +739,9 @@ void WeightMask32x16_10bpp_SSE4(const void* prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x32_10bpp_SSE4(const void* prediction_0,
- const void* prediction_1, uint8_t* mask,
+void WeightMask32x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -713,8 +758,9 @@ void WeightMask32x32_10bpp_SSE4(const void* prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x64_10bpp_SSE4(const void* prediction_0,
- const void* prediction_1, uint8_t* mask,
+void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -744,8 +790,9 @@ void WeightMask32x64_10bpp_SSE4(const void* prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask64x16_10bpp_SSE4(const void* prediction_0,
- const void* prediction_1, uint8_t* mask,
+void WeightMask64x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -759,8 +806,9 @@ void WeightMask64x16_10bpp_SSE4(const void* prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x32_10bpp_SSE4(const void* prediction_0,
- const void* prediction_1, uint8_t* mask,
+void WeightMask64x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -777,8 +825,9 @@ void WeightMask64x32_10bpp_SSE4(const void* prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x64_10bpp_SSE4(const void* prediction_0,
- const void* prediction_1, uint8_t* mask,
+void WeightMask64x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -792,8 +841,9 @@ void WeightMask64x64_10bpp_SSE4(const void* prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x128_10bpp_SSE4(const void* prediction_0,
- const void* prediction_1, uint8_t* mask,
+void WeightMask64x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -808,8 +858,9 @@ void WeightMask64x128_10bpp_SSE4(const void* prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask128x64_10bpp_SSE4(const void* prediction_0,
- const void* prediction_1, uint8_t* mask,
+void WeightMask128x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -851,8 +902,9 @@ void WeightMask128x64_10bpp_SSE4(const void* prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask128x128_10bpp_SSE4(const void* prediction_0,
- const void* prediction_1, uint8_t* mask,
+void WeightMask128x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
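The change above is the same mechanical rewrite applied to every WeightMask specialization: each pointer parameter gains the LIBGAV1_RESTRICT qualifier so the compiler may assume the two prediction buffers and the output mask never alias, which lets it keep the SSE4 loads and stores reordered without runtime alias checks. A minimal scalar sketch of the idea follows; it assumes LIBGAV1_RESTRICT expands to the compiler's non-standard __restrict keyword where available (the real definition lives in libgav1's compiler-attribute header and may differ), and ExampleWeightMask is a hypothetical stand-in, not one of the kernels in this diff.

    #include <cstdint>

    // Assumption: maps to __restrict on GCC/Clang/MSVC, nothing elsewhere.
    #if defined(__GNUC__) || defined(_MSC_VER)
    #define LIBGAV1_RESTRICT __restrict
    #else
    #define LIBGAV1_RESTRICT
    #endif

    // With the qualifier, the compiler may assume pred_0, pred_1, and mask
    // do not overlap, so the per-pixel loop can be vectorized freely.
    void ExampleWeightMask(const uint16_t* LIBGAV1_RESTRICT pred_0,
                           const uint16_t* LIBGAV1_RESTRICT pred_1,
                           uint8_t* LIBGAV1_RESTRICT mask, int width) {
      for (int x = 0; x < width; ++x) {
        // Hypothetical per-pixel weight: a clamped absolute difference.
        // The real kernels compute a rounded, scaled difference with
        // SSE4 intrinsics and a mask_is_inverse template parameter.
        const int diff = pred_0[x] > pred_1[x] ? pred_0[x] - pred_1[x]
                                               : pred_1[x] - pred_0[x];
        mask[x] = static_cast<uint8_t>(diff > 255 ? 255 : diff);
      }
    }

Because every 10bpp WeightMask kernel shares this signature shape, the patch applies the identical three-line parameter rewrite to each block size; only the function name and the unrolled loop bodies differ between hunks.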