| author | Boyuan Yang <byang@debian.org> | 2021-11-07 08:50:18 -0500 |
|---|---|---|
| committer | Boyuan Yang <byang@debian.org> | 2021-11-07 08:50:18 -0500 |
| commit | 320ef65362608ee1148c299d8d5d7618af34e470 (patch) | |
| tree | c47911c219d1e35b8b0771e9e0176eff0e0d08ec /src/dsp/x86 | |
| parent | 2381d803c76105f44717d75f089ec37f51e5cfe4 (diff) | |
| download | libgav1-320ef65362608ee1148c299d8d5d7618af34e470.tar.gz, libgav1-320ef65362608ee1148c299d8d5d7618af34e470.tar.bz2, libgav1-320ef65362608ee1148c299d8d5d7618af34e470.zip | |
New upstream version 0.17.0
Diffstat (limited to 'src/dsp/x86')
27 files changed, 1368 insertions, 1086 deletions
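Nearly every hunk below makes the same mechanical change: pointer parameters in the x86 DSP kernels gain a LIBGAV1_RESTRICT qualifier, promising the compiler that the annotated source and destination buffers never alias. A minimal sketch of the idea, assuming the macro wraps the compiler's nonstandard __restrict keyword (the library's real definition lives in its compiler-attribute header and may differ):

```cpp
#include <cstdint>

// Hypothetical stand-in for LIBGAV1_RESTRICT, assumed to expand to the
// compiler's nonstandard __restrict qualifier where available.
#if defined(__GNUC__) || defined(_MSC_VER)
#define RESTRICT_SKETCH __restrict
#else
#define RESTRICT_SKETCH
#endif

// With the qualifier the compiler may assume |src| and |dst| do not overlap,
// so it can keep |src| values in registers and vectorize the loop freely.
void AddRow(const int16_t* RESTRICT_SKETCH src, int16_t* RESTRICT_SKETCH dst,
            int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = static_cast<int16_t>(dst[x] + src[x]);
  }
}
```

Without that promise, each store to dst[x] could in principle modify src, forcing the compiler to reload src[x] on every iteration.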
diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc
index ec9f589..911c5a9 100644
--- a/src/dsp/x86/average_blend_sse4.cc
+++ b/src/dsp/x86/average_blend_sse4.cc
@@ -35,8 +35,9 @@ namespace {

 constexpr int kInterPostRoundBit = 4;

-inline void AverageBlend4Row(const int16_t* prediction_0,
-                             const int16_t* prediction_1, uint8_t* dest) {
+inline void AverageBlend4Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                             const int16_t* LIBGAV1_RESTRICT prediction_1,
+                             uint8_t* LIBGAV1_RESTRICT dest) {
   const __m128i pred_0 = LoadLo8(prediction_0);
   const __m128i pred_1 = LoadLo8(prediction_1);
   __m128i res = _mm_add_epi16(pred_0, pred_1);
@@ -44,8 +45,9 @@ inline void AverageBlend4Row(const int16_t* prediction_0,
   Store4(dest, _mm_packus_epi16(res, res));
 }

-inline void AverageBlend8Row(const int16_t* prediction_0,
-                             const int16_t* prediction_1, uint8_t* dest) {
+inline void AverageBlend8Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                             const int16_t* LIBGAV1_RESTRICT prediction_1,
+                             uint8_t* LIBGAV1_RESTRICT dest) {
   const __m128i pred_0 = LoadAligned16(prediction_0);
   const __m128i pred_1 = LoadAligned16(prediction_1);
   __m128i res = _mm_add_epi16(pred_0, pred_1);
@@ -53,9 +55,10 @@ inline void AverageBlend8Row(const int16_t* prediction_0,
   StoreLo8(dest, _mm_packus_epi16(res, res));
 }

-inline void AverageBlendLargeRow(const int16_t* prediction_0,
-                                 const int16_t* prediction_1, const int width,
-                                 uint8_t* dest) {
+inline void AverageBlendLargeRow(const int16_t* LIBGAV1_RESTRICT prediction_0,
+                                 const int16_t* LIBGAV1_RESTRICT prediction_1,
+                                 const int width,
+                                 uint8_t* LIBGAV1_RESTRICT dest) {
   int x = 0;
   do {
     const __m128i pred_00 = LoadAligned16(&prediction_0[x]);
@@ -71,8 +74,10 @@ inline void AverageBlendLargeRow(const int16_t* prediction_0,
   } while (x < width);
 }

-void AverageBlend_SSE4_1(const void* prediction_0, const void* prediction_1,
-                         const int width, const int height, void* const dest,
+void AverageBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                         const void* LIBGAV1_RESTRICT prediction_1,
+                         const int width, const int height,
+                         void* LIBGAV1_RESTRICT const dest,
                          const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
@@ -148,11 +153,11 @@ namespace {

 constexpr int kInterPostRoundBitPlusOne = 5;

 template <const int width, const int offset>
-inline void AverageBlendRow(const uint16_t* prediction_0,
-                            const uint16_t* prediction_1,
+inline void AverageBlendRow(const uint16_t* LIBGAV1_RESTRICT prediction_0,
+                            const uint16_t* LIBGAV1_RESTRICT prediction_1,
                             const __m128i& compound_offset,
                             const __m128i& round_offset, const __m128i& max,
-                            const __m128i& zero, uint16_t* dst,
+                            const __m128i& zero, uint16_t* LIBGAV1_RESTRICT dst,
                             const ptrdiff_t dest_stride) {
   // pred_0/1 max range is 16b.
   const __m128i pred_0 = LoadUnaligned16(prediction_0 + offset);
@@ -182,9 +187,10 @@ inline void AverageBlendRow(const uint16_t* prediction_0,
   StoreHi8(dst + dest_stride, result);
 }

-void AverageBlend10bpp_SSE4_1(const void* prediction_0,
-                              const void* prediction_1, const int width,
-                              const int height, void* const dest,
+void AverageBlend10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                              const void* LIBGAV1_RESTRICT prediction_1,
+                              const int width, const int height,
+                              void* LIBGAV1_RESTRICT const dest,
                               const ptrdiff_t dst_stride) {
   auto* dst = static_cast<uint16_t*>(dest);
   const ptrdiff_t dest_stride = dst_stride / sizeof(dst[0]);
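For reference, the blend these kernels vectorize is a rounded average of two compound predictions. A scalar sketch of one 8bpp pixel, assuming the lines elided from the hunks above apply the usual rounded shift by kInterPostRoundBit + 1 and saturate the way _mm_packus_epi16 does:

```cpp
#include <algorithm>
#include <cstdint>

constexpr int kInterPostRoundBit = 4;

// Scalar model of one AverageBlend output pixel (assumption: the elided
// SIMD lines shift by kInterPostRoundBit + 1 with rounding, then clamp).
uint8_t AverageBlendPixel(int16_t pred_0, int16_t pred_1) {
  constexpr int kShift = kInterPostRoundBit + 1;
  const int sum = pred_0 + pred_1 + ((1 << kShift) >> 1);
  return static_cast<uint8_t>(std::clamp(sum >> kShift, 0, 255));
}
```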
diff --git a/src/dsp/x86/cdef_avx2.cc b/src/dsp/x86/cdef_avx2.cc
index d41dc38..01a2b9f 100644
--- a/src/dsp/x86/cdef_avx2.cc
+++ b/src/dsp/x86/cdef_avx2.cc
@@ -269,8 +269,8 @@ LIBGAV1_ALWAYS_INLINE void AddPartial_D7_D5(__m256i* v_src, __m256i* partial_lo,
       _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[3], 10));
 }

-LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* src, ptrdiff_t stride,
-                                      __m256i* partial) {
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* LIBGAV1_RESTRICT src,
+                                      ptrdiff_t stride, __m256i* partial) {
   // 8x8 input
   // 00 01 02 03 04 05 06 07
   // 10 11 12 13 14 15 16 17
@@ -451,8 +451,10 @@ inline void Cost2And6_Pair(uint32_t* cost, const __m256i partial_a,
   cost[6] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
 }

-void CdefDirection_AVX2(const void* const source, ptrdiff_t stride,
-                        uint8_t* const direction, int* const variance) {
+void CdefDirection_AVX2(const void* LIBGAV1_RESTRICT const source,
+                        ptrdiff_t stride,
+                        uint8_t* LIBGAV1_RESTRICT const direction,
+                        int* LIBGAV1_RESTRICT const variance) {
   assert(direction != nullptr);
   assert(variance != nullptr);
   const auto* src = static_cast<const uint8_t*>(source);
@@ -500,8 +502,9 @@ void CdefDirection_AVX2(const void* const source, ptrdiff_t stride,
 // CdefFilter

 // Load 4 vectors based on the given |direction|.
-inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
-                          __m128i* output, const int direction) {
+inline void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
+                          const ptrdiff_t stride, __m128i* output,
+                          const int direction) {
   // Each |direction| describes a different set of source values. Expand this
   // set by negating each set. For |direction| == 0 this gives a diagonal line
   // from top right to bottom left. The first value is y, the second x. Negative
@@ -525,8 +528,9 @@ inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,

 // Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
 // do 2 rows at a time.
-void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
-                    __m128i* output, const int direction) {
+void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
+                    const ptrdiff_t stride, __m128i* output,
+                    const int direction) {
   const int y_0 = kCdefDirections[direction][0][0];
   const int x_0 = kCdefDirections[direction][0][1];
   const int y_1 = kCdefDirections[direction][1][0];
@@ -569,11 +573,11 @@ inline __m256i ApplyConstrainAndTap(const __m256i& pixel, const __m256i& val,
 }

 template <int width, bool enable_primary = true, bool enable_secondary = true>
-void CdefFilter_AVX2(const uint16_t* src, const ptrdiff_t src_stride,
-                     const int height, const int primary_strength,
-                     const int secondary_strength, const int damping,
-                     const int direction, void* dest,
-                     const ptrdiff_t dst_stride) {
+void CdefFilter_AVX2(const uint16_t* LIBGAV1_RESTRICT src,
+                     const ptrdiff_t src_stride, const int height,
+                     const int primary_strength, const int secondary_strength,
+                     const int damping, const int direction,
+                     void* LIBGAV1_RESTRICT dest, const ptrdiff_t dst_stride) {
   static_assert(width == 8 || width == 4, "Invalid CDEF width.");
   static_assert(enable_primary || enable_secondary, "");
   constexpr bool clipping_required = enable_primary && enable_secondary;
diff --git a/src/dsp/x86/cdef_sse4.cc b/src/dsp/x86/cdef_sse4.cc
index 6ede778..6c48844 100644
--- a/src/dsp/x86/cdef_sse4.cc
+++ b/src/dsp/x86/cdef_sse4.cc
@@ -241,8 +241,8 @@ LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(__m128i* v_src, __m128i* partial_lo,
   *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[3], 10));
 }

-LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* src, ptrdiff_t stride,
-                                      __m128i* partial_lo,
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* LIBGAV1_RESTRICT src,
+                                      ptrdiff_t stride, __m128i* partial_lo,
                                       __m128i* partial_hi) {
   // 8x8 input
   // 00 01 02 03 04 05 06 07
@@ -395,8 +395,10 @@ inline uint32_t SquareSum_S16(const __m128i a) {
   return SumVector_S32(square);
 }

-void CdefDirection_SSE4_1(const void* const source, ptrdiff_t stride,
-                          uint8_t* const direction, int* const variance) {
+void CdefDirection_SSE4_1(const void* LIBGAV1_RESTRICT const source,
+                          ptrdiff_t stride,
+                          uint8_t* LIBGAV1_RESTRICT const direction,
+                          int* LIBGAV1_RESTRICT const variance) {
   assert(direction != nullptr);
   assert(variance != nullptr);
   const auto* src = static_cast<const uint8_t*>(source);
@@ -438,8 +440,9 @@ void CdefDirection_SSE4_1(const void* const source, ptrdiff_t stride,
 // CdefFilter

 // Load 4 vectors based on the given |direction|.
-inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
-                          __m128i* output, const int direction) {
+inline void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
+                          const ptrdiff_t stride, __m128i* output,
+                          const int direction) {
   // Each |direction| describes a different set of source values. Expand this
   // set by negating each set. For |direction| == 0 this gives a diagonal line
   // from top right to bottom left. The first value is y, the second x. Negative
@@ -463,8 +466,9 @@ inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,

 // Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
 // do 2 rows at a time.
-void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
-                    __m128i* output, const int direction) {
+void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
+                    const ptrdiff_t stride, __m128i* output,
+                    const int direction) {
   const int y_0 = kCdefDirections[direction][0][0];
   const int x_0 = kCdefDirections[direction][0][1];
   const int y_1 = kCdefDirections[direction][1][0];
@@ -507,10 +511,11 @@ inline __m128i ApplyConstrainAndTap(const __m128i& pixel, const __m128i& val,
 }

 template <int width, bool enable_primary = true, bool enable_secondary = true>
-void CdefFilter_SSE4_1(const uint16_t* src, const ptrdiff_t src_stride,
-                       const int height, const int primary_strength,
-                       const int secondary_strength, const int damping,
-                       const int direction, void* dest,
+void CdefFilter_SSE4_1(const uint16_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride, const int height,
+                       const int primary_strength, const int secondary_strength,
+                       const int damping, const int direction,
+                       void* LIBGAV1_RESTRICT dest,
                        const ptrdiff_t dst_stride) {
   static_assert(width == 8 || width == 4, "Invalid CDEF width.");
   static_assert(enable_primary || enable_secondary, "");
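ApplyConstrainAndTap, visible in both CDEF files, is the vector form of the constrain() function from the AV1 spec: the difference between a neighboring pixel and the center pixel is clipped by the filter strength, with the clip loosening as damping grows. A scalar sketch (simplified; the kernels fold the tap multiply into the same step):

```cpp
#include <algorithm>
#include <cstdlib>

// Scalar sketch of AV1's CDEF constrain(); FloorLog2 is inlined as a loop.
int Constrain(int diff, int strength, int damping) {
  if (strength == 0) return 0;
  int floor_log2_strength = 0;
  while ((2 << floor_log2_strength) <= strength) ++floor_log2_strength;
  const int shift = std::max(0, damping - floor_log2_strength);
  const int abs_diff = std::abs(diff);
  const int magnitude =
      std::min(abs_diff, std::max(0, strength - (abs_diff >> shift)));
  return (diff < 0) ? -magnitude : magnitude;
}
```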
diff --git a/src/dsp/x86/common_avx2_test.cc b/src/dsp/x86/common_avx2_test.cc
new file mode 100644
index 0000000..2062683
--- /dev/null
+++ b/src/dsp/x86/common_avx2_test.cc
@@ -0,0 +1,67 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/x86/common_avx2.h"
+
+#include "gtest/gtest.h"
+
+#if LIBGAV1_TARGETING_AVX2
+
+#include <cstdint>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Show that RightShiftWithRounding_S16() is equal to
+// RightShiftWithRounding() only for values less than or equal to
+// INT16_MAX - ((1 << bits) >> 1). In particular, if bits == 16, then
+// RightShiftWithRounding_S16() is equal to RightShiftWithRounding() only for
+// negative values.
+TEST(CommonDspTest, AVX2RightShiftWithRoundingS16) {
+  for (int bits = 0; bits < 16; ++bits) {
+    const int bias = (1 << bits) >> 1;
+    for (int32_t value = INT16_MIN; value <= INT16_MAX; ++value) {
+      const __m256i v_val_d = _mm256_set1_epi16(value);
+      const __m256i v_result_d = RightShiftWithRounding_S16(v_val_d, bits);
+      // Note _mm256_extract_epi16 is avoided for compatibility with Visual
+      // Studio < 2017.
+      const int16_t result =
+          _mm_extract_epi16(_mm256_extracti128_si256(v_result_d, 0), 0);
+      const int32_t expected = RightShiftWithRounding(value, bits);
+      if (value <= INT16_MAX - bias) {
+        EXPECT_EQ(result, expected) << "value: " << value << ", bits: " << bits;
+      } else {
+        EXPECT_EQ(expected, 1 << (15 - bits));
+        EXPECT_EQ(result, -expected)
+            << "value: " << value << ", bits: " << bits;
+      }
+    }
+  }
+}
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_TARGETING_AVX2
+
+TEST(CommonDspTest, AVX2) {
+  GTEST_SKIP() << "Build this module for x86(-64) with AVX2 enabled to enable "
+                  "the tests.";
+}
+
+#endif  // LIBGAV1_TARGETING_AVX2
diff --git a/src/dsp/x86/common_sse4_test.cc b/src/dsp/x86/common_sse4_test.cc
new file mode 100644
index 0000000..4ea811a
--- /dev/null
+++ b/src/dsp/x86/common_sse4_test.cc
@@ -0,0 +1,64 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/x86/common_sse4.h"
+
+#include "gtest/gtest.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <cstdint>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Show that RightShiftWithRounding_S16() is equal to
+// RightShiftWithRounding() only for values less than or equal to
+// INT16_MAX - ((1 << bits) >> 1). In particular, if bits == 16, then
+// RightShiftWithRounding_S16() is equal to RightShiftWithRounding() only for
+// negative values.
+TEST(CommonDspTest, SSE4RightShiftWithRoundingS16) {
+  for (int bits = 0; bits < 16; ++bits) {
+    const int bias = (1 << bits) >> 1;
+    for (int32_t value = INT16_MIN; value <= INT16_MAX; ++value) {
+      const __m128i v_val_d = _mm_set1_epi16(value);
+      const __m128i v_result_d = RightShiftWithRounding_S16(v_val_d, bits);
+      const int16_t result = _mm_extract_epi16(v_result_d, 0);
+      const int32_t expected = RightShiftWithRounding(value, bits);
+      if (value <= INT16_MAX - bias) {
+        EXPECT_EQ(result, expected) << "value: " << value << ", bits: " << bits;
+      } else {
+        EXPECT_EQ(expected, 1 << (15 - bits));
+        EXPECT_EQ(result, -expected)
+            << "value: " << value << ", bits: " << bits;
+      }
+    }
+  }
+}
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_TARGETING_SSE4_1
+
+TEST(CommonDspTest, SSE4) {
+  GTEST_SKIP() << "Build this module for x86(-64) with SSE4 enabled to enable "
+                  "the tests.";
+}
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
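The two new tests pin down the same edge case: RightShiftWithRounding_S16() adds its rounding bias in 16 bits, so values near INT16_MAX wrap around before the shift. A worked scalar model of what the tests expect:

```cpp
#include <cstdint>

// Scalar model of the 16-bit wraparound the tests document. The cast back
// to int16_t mirrors _mm_add_epi16's modular arithmetic.
int16_t RightShiftWithRoundingS16Model(int16_t value, int bits) {
  const auto biased = static_cast<int16_t>(value + ((1 << bits) >> 1));
  return static_cast<int16_t>(biased >> bits);  // arithmetic shift
}
// Example: bits = 2, value = 32767. The bias is 2, and 32767 + 2 wraps to
// -32767, so the model returns -8192, while the widening
// RightShiftWithRounding(32767, 2) returns +8192 = 1 << (15 - bits).
```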
diff --git a/src/dsp/x86/convolve_avx2.cc b/src/dsp/x86/convolve_avx2.cc
index 2ecb77c..4126ca9 100644
--- a/src/dsp/x86/convolve_avx2.cc
+++ b/src/dsp/x86/convolve_avx2.cc
@@ -127,10 +127,11 @@ __m256i HorizontalTaps8To16(const __m256i* const src,
 // Filter 2xh sizes.
 template <int num_taps, int filter_index, bool is_2d = false,
           bool is_compound = false>
-void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
-                      void* const dest, const ptrdiff_t pred_stride,
-                      const int /*width*/, const int height,
-                      const __m128i* const v_tap) {
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+                      const ptrdiff_t src_stride,
+                      void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t pred_stride, const int /*width*/,
+                      const int height, const __m128i* const v_tap) {
   auto* dest8 = static_cast<uint8_t*>(dest);
   auto* dest16 = static_cast<uint16_t*>(dest);

@@ -195,10 +196,11 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
 // Filter widths >= 4.
 template <int num_taps, int filter_index, bool is_2d = false,
           bool is_compound = false>
-void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
-                      void* const dest, const ptrdiff_t pred_stride,
-                      const int width, const int height,
-                      const __m256i* const v_tap) {
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+                      const ptrdiff_t src_stride,
+                      void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t pred_stride, const int width,
+                      const int height, const __m256i* const v_tap) {
   auto* dest8 = static_cast<uint8_t*>(dest);
   auto* dest16 = static_cast<uint16_t*>(dest);

@@ -467,7 +469,8 @@ __m256i SimpleSum2DVerticalTaps(const __m256i* const src,
 }

 template <int num_taps, bool is_compound = false>
-void Filter2DVertical16xH(const uint16_t* src, void* const dst,
+void Filter2DVertical16xH(const uint16_t* LIBGAV1_RESTRICT src,
+                          void* LIBGAV1_RESTRICT const dst,
                           const ptrdiff_t dst_stride, const int width,
                           const int height, const __m256i* const taps) {
   assert(width >= 8);
@@ -542,9 +545,10 @@ void Filter2DVertical16xH(const uint16_t* src, void* const dst,

 template <bool is_2d = false, bool is_compound = false>
 LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
-    const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
-    const ptrdiff_t dst_stride, const int width, const int height,
-    const int filter_id, const int filter_index) {
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+    const int width, const int height, const int filter_id,
+    const int filter_index) {
   assert(filter_id != 0);
   __m128i v_tap[4];
   const __m128i v_horizontal_filter =
@@ -567,9 +571,10 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(

 template <bool is_2d = false, bool is_compound = false>
 LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
-    const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
-    const ptrdiff_t dst_stride, const int width, const int height,
-    const int filter_id, const int filter_index) {
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+    const int width, const int height, const int filter_id,
+    const int filter_index) {
   assert(filter_id != 0);
   __m256i v_tap[4];
   const __m128i v_horizontal_filter =
@@ -602,13 +607,13 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
   }
 }

-void Convolve2D_AVX2(const void* const reference,
+void Convolve2D_AVX2(const void* LIBGAV1_RESTRICT const reference,
                      const ptrdiff_t reference_stride,
                      const int horizontal_filter_index,
                      const int vertical_filter_index,
                      const int horizontal_filter_id,
                      const int vertical_filter_id, const int width,
-                     const int height, void* prediction,
+                     const int height, void* LIBGAV1_RESTRICT prediction,
                      const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
@@ -774,10 +779,11 @@ __m256i SumVerticalTaps(const __m256i* const srcs, const __m256i* const v_tap) {
 }

 template <int filter_index, bool is_compound = false>
-void FilterVertical32xH(const uint8_t* src, const ptrdiff_t src_stride,
-                        void* const dst, const ptrdiff_t dst_stride,
-                        const int width, const int height,
-                        const __m256i* const v_tap) {
+void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
+                        const ptrdiff_t src_stride,
+                        void* LIBGAV1_RESTRICT const dst,
+                        const ptrdiff_t dst_stride, const int width,
+                        const int height, const __m256i* const v_tap) {
   const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   auto* dst8 = static_cast<uint8_t*>(dst);
@@ -856,10 +862,11 @@ void FilterVertical32xH(const uint8_t* src, const ptrdiff_t src_stride,
 }

 template <int filter_index, bool is_compound = false>
-void FilterVertical16xH(const uint8_t* src, const ptrdiff_t src_stride,
-                        void* const dst, const ptrdiff_t dst_stride,
-                        const int /*width*/, const int height,
-                        const __m256i* const v_tap) {
+void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
+                        const ptrdiff_t src_stride,
+                        void* LIBGAV1_RESTRICT const dst,
+                        const ptrdiff_t dst_stride, const int /*width*/,
+                        const int height, const __m256i* const v_tap) {
   const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps;
   auto* dst8 = static_cast<uint8_t*>(dst);
@@ -958,10 +965,11 @@ void FilterVertical16xH(const uint8_t* src, const ptrdiff_t src_stride,
 }

 template <int filter_index, bool is_compound = false>
-void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride,
-                       void* const dst, const ptrdiff_t dst_stride,
-                       const int /*width*/, const int height,
-                       const __m256i* const v_tap) {
+void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride,
+                       void* LIBGAV1_RESTRICT const dst,
+                       const ptrdiff_t dst_stride, const int /*width*/,
+                       const int height, const __m256i* const v_tap) {
   const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps;
   auto* dst8 = static_cast<uint8_t*>(dst);
@@ -1055,10 +1063,11 @@ void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride,
 }

 template <int filter_index, bool is_compound = false>
-void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride,
-                       void* const dst, const ptrdiff_t dst_stride,
-                       const int /*width*/, const int height,
-                       const __m128i* const v_tap) {
+void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride,
+                       void* LIBGAV1_RESTRICT const dst,
+                       const ptrdiff_t dst_stride, const int /*width*/,
+                       const int height, const __m128i* const v_tap) {
   const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   auto* dst8 = static_cast<uint8_t*>(dst);
@@ -1119,13 +1128,13 @@ void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride,
   } while (--y != 0);
 }

-void ConvolveVertical_AVX2(const void* const reference,
+void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
                            const ptrdiff_t reference_stride,
                            const int /*horizontal_filter_index*/,
                            const int vertical_filter_index,
                            const int /*horizontal_filter_id*/,
                            const int vertical_filter_id, const int width,
-                           const int height, void* prediction,
+                           const int height, void* LIBGAV1_RESTRICT prediction,
                            const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(filter_index);
@@ -1257,11 +1266,11 @@ void ConvolveVertical_AVX2(const void* const reference,
 }

 void ConvolveCompoundVertical_AVX2(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int /*horizontal_filter_index*/, const int vertical_filter_index,
-    const int /*horizontal_filter_id*/, const int vertical_filter_id,
-    const int width, const int height, void* prediction,
-    const ptrdiff_t /*pred_stride*/) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(filter_index);
   const ptrdiff_t src_stride = reference_stride;
@@ -1366,14 +1375,12 @@ void ConvolveCompoundVertical_AVX2(
   }
 }

-void ConvolveHorizontal_AVX2(const void* const reference,
-                             const ptrdiff_t reference_stride,
-                             const int horizontal_filter_index,
-                             const int /*vertical_filter_index*/,
-                             const int horizontal_filter_id,
-                             const int /*vertical_filter_id*/, const int width,
-                             const int height, void* prediction,
-                             const ptrdiff_t pred_stride) {
+void ConvolveHorizontal_AVX2(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
   // Set |src| to the outermost tap.
   const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
@@ -1390,11 +1397,11 @@ void ConvolveHorizontal_AVX2(const void* const reference,
 }

 void ConvolveCompoundHorizontal_AVX2(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int horizontal_filter_index, const int /*vertical_filter_index*/,
-    const int horizontal_filter_id, const int /*vertical_filter_id*/,
-    const int width, const int height, void* prediction,
-    const ptrdiff_t pred_stride) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
   // Set |src| to the outermost tap.
   const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
@@ -1415,14 +1422,12 @@ void ConvolveCompoundHorizontal_AVX2(
       filter_index);
 }

-void ConvolveCompound2D_AVX2(const void* const reference,
-                             const ptrdiff_t reference_stride,
-                             const int horizontal_filter_index,
-                             const int vertical_filter_index,
-                             const int horizontal_filter_id,
-                             const int vertical_filter_id, const int width,
-                             const int height, void* prediction,
-                             const ptrdiff_t pred_stride) {
+void ConvolveCompound2D_AVX2(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int vertical_filter_index, const int horizontal_filter_id,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
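Underneath all the signature churn, each FilterHorizontal/FilterVertical variant computes the same per-pixel value: an 8-tap dot product over consecutive source pixels (shorter filters are zero-padded). A scalar reference, with the rounding and clipping done by the intrinsics elided:

```cpp
#include <cstdint>

// Scalar reference for one filtered pixel; the AVX2 kernels compute 16 or
// 32 of these per step. |taps| holds the 8 subpixel filter coefficients.
int FilterPixel(const uint8_t* src, const int8_t taps[8]) {
  int sum = 0;
  for (int k = 0; k < 8; ++k) sum += taps[k] * src[k];
  return sum;  // callers round-shift and clamp, or store 16-bit compound
}
```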
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc
index 9b72fe4..f7e5a71 100644
--- a/src/dsp/x86/convolve_sse4.cc
+++ b/src/dsp/x86/convolve_sse4.cc
@@ -37,7 +37,7 @@ namespace {
 #include "src/dsp/x86/convolve_sse4.inc"

 template <int filter_index>
-__m128i SumHorizontalTaps(const uint8_t* const src,
+__m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
                           const __m128i* const v_tap) {
   __m128i v_src[4];
   const __m128i src_long = LoadUnaligned16(src);
@@ -68,7 +68,7 @@ __m128i SumHorizontalTaps(const uint8_t* const src,
 }

 template <int filter_index>
-__m128i SimpleHorizontalTaps(const uint8_t* const src,
+__m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
                              const __m128i* const v_tap) {
   __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);

@@ -84,7 +84,7 @@ __m128i SimpleHorizontalTaps(const uint8_t* const src,
 }

 template <int filter_index>
-__m128i HorizontalTaps8To16(const uint8_t* const src,
+__m128i HorizontalTaps8To16(const uint8_t* LIBGAV1_RESTRICT const src,
                             const __m128i* const v_tap) {
   const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);

@@ -93,10 +93,11 @@ __m128i HorizontalTaps8To16(const uint8_t* const src,

 template <int num_taps, int filter_index, bool is_2d = false,
           bool is_compound = false>
-void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
-                      void* const dest, const ptrdiff_t pred_stride,
-                      const int width, const int height,
-                      const __m128i* const v_tap) {
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+                      const ptrdiff_t src_stride,
+                      void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t pred_stride, const int width,
+                      const int height, const __m128i* const v_tap) {
   auto* dest8 = static_cast<uint8_t*>(dest);
   auto* dest16 = static_cast<uint16_t*>(dest);

@@ -206,9 +207,10 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,

 template <bool is_2d = false, bool is_compound = false>
 LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
-    const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
-    const ptrdiff_t dst_stride, const int width, const int height,
-    const int filter_id, const int filter_index) {
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+    const int width, const int height, const int filter_id,
+    const int filter_index) {
   assert(filter_id != 0);
   __m128i v_tap[4];
   const __m128i v_horizontal_filter =
@@ -241,13 +243,13 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
   }
 }

-void Convolve2D_SSE4_1(const void* const reference,
+void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
                        const ptrdiff_t reference_stride,
                        const int horizontal_filter_index,
                        const int vertical_filter_index,
                        const int horizontal_filter_id,
                        const int vertical_filter_id, const int width,
-                       const int height, void* prediction,
+                       const int height, void* LIBGAV1_RESTRICT prediction,
                        const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
@@ -328,10 +330,11 @@ void Convolve2D_SSE4_1(const void* const reference,
 }

 template <int filter_index, bool is_compound = false>
-void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
-                    void* const dst, const ptrdiff_t dst_stride,
-                    const int width, const int height,
-                    const __m128i* const v_tap) {
+void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
+                    const ptrdiff_t src_stride,
+                    void* LIBGAV1_RESTRICT const dst,
+                    const ptrdiff_t dst_stride, const int width,
+                    const int height, const __m128i* const v_tap) {
   const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
   auto* dst8 = static_cast<uint8_t*>(dst);
@@ -400,14 +403,12 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
   } while (x < width);
 }

-void ConvolveVertical_SSE4_1(const void* const reference,
-                             const ptrdiff_t reference_stride,
-                             const int /*horizontal_filter_index*/,
-                             const int vertical_filter_index,
-                             const int /*horizontal_filter_id*/,
-                             const int vertical_filter_id, const int width,
-                             const int height, void* prediction,
-                             const ptrdiff_t pred_stride) {
+void ConvolveVertical_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(filter_index);
   const ptrdiff_t src_stride = reference_stride;
@@ -477,14 +478,12 @@ void ConvolveVertical_SSE4_1(const void* const reference,
   }
 }

-void ConvolveCompoundCopy_SSE4(const void* const reference,
-                               const ptrdiff_t reference_stride,
-                               const int /*horizontal_filter_index*/,
-                               const int /*vertical_filter_index*/,
-                               const int /*horizontal_filter_id*/,
-                               const int /*vertical_filter_id*/,
-                               const int width, const int height,
-                               void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveCompoundCopy_SSE4(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   const auto* src = static_cast<const uint8_t*>(reference);
   const ptrdiff_t src_stride = reference_stride;
   auto* dest = static_cast<uint16_t*>(prediction);
@@ -539,11 +538,11 @@ void ConvolveCompoundCopy_SSE4(const void* const reference,
 }

 void ConvolveCompoundVertical_SSE4_1(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int /*horizontal_filter_index*/, const int vertical_filter_index,
-    const int /*horizontal_filter_id*/, const int vertical_filter_id,
-    const int width, const int height, void* prediction,
-    const ptrdiff_t /*pred_stride*/) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(filter_index);
   const ptrdiff_t src_stride = reference_stride;
@@ -608,14 +607,12 @@ void ConvolveCompoundVertical_SSE4_1(
   }
 }

-void ConvolveHorizontal_SSE4_1(const void* const reference,
-                               const ptrdiff_t reference_stride,
-                               const int horizontal_filter_index,
-                               const int /*vertical_filter_index*/,
-                               const int horizontal_filter_id,
-                               const int /*vertical_filter_id*/,
-                               const int width, const int height,
-                               void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveHorizontal_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
   // Set |src| to the outermost tap.
   const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
@@ -626,11 +623,11 @@ void ConvolveHorizontal_SSE4_1(const void* const reference,
 }

 void ConvolveCompoundHorizontal_SSE4_1(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int horizontal_filter_index, const int /*vertical_filter_index*/,
-    const int horizontal_filter_id, const int /*vertical_filter_id*/,
-    const int width, const int height, void* prediction,
-    const ptrdiff_t /*pred_stride*/) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
   const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
   auto* dest = static_cast<uint16_t*>(prediction);
@@ -640,14 +637,12 @@ void ConvolveCompoundHorizontal_SSE4_1(
       filter_index);
 }

-void ConvolveCompound2D_SSE4_1(const void* const reference,
-                               const ptrdiff_t reference_stride,
-                               const int horizontal_filter_index,
-                               const int vertical_filter_index,
-                               const int horizontal_filter_id,
-                               const int vertical_filter_id, const int width,
-                               const int height, void* prediction,
-                               const ptrdiff_t /*pred_stride*/) {
+void ConvolveCompound2D_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int vertical_filter_index, const int horizontal_filter_id,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
   // The output of the horizontal filter, i.e. the intermediate_result, is
   // guaranteed to fit in int16_t.
   alignas(16) uint16_t
@@ -835,7 +830,8 @@ inline void GetHalfSubPixelFilter(__m128i* output) {
 // exceed 4 when width <= 4, |grade_x| is set to 1 regardless of the value of
 // |step_x|.
 template <int num_taps, int grade_x>
-inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices,
+inline void PrepareSourceVectors(const uint8_t* LIBGAV1_RESTRICT src,
+                                 const __m128i src_indices,
                                  __m128i* const source /*[num_taps >> 1]*/) {
   // |used_bytes| is only computed in msan builds. Mask away unused bytes for
   // msan because it incorrectly models the outcome of the shuffles in some
@@ -900,10 +896,11 @@ inline __m128i HorizontalScaleIndices(const __m128i subpel_indices) {
 }

 template <int grade_x, int filter_index, int num_taps>
-inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride,
-                                    int width, int subpixel_x, int step_x,
+inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
+                                    ptrdiff_t src_stride, int width,
+                                    int subpixel_x, int step_x,
                                     int intermediate_height,
-                                    int16_t* intermediate) {
+                                    int16_t* LIBGAV1_RESTRICT intermediate) {
   // Account for the 0-taps that precede the 2 nonzero taps.
   const int kernel_offset = (8 - num_taps) >> 1;
   const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -946,11 +943,11 @@ inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride,
   }

   // |width| >= 8
+  int16_t* intermediate_x = intermediate;
   int x = 0;
   do {
     const uint8_t* src_x =
         &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
-    int16_t* intermediate_x = intermediate + x;
     // Only add steps to the 10-bit truncated p to avoid overflow.
     const __m128i p_fraction = _mm_set1_epi16(p & 1023);
     const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
@@ -976,7 +973,8 @@ inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride,
 }

 template <int num_taps>
-inline void PrepareVerticalTaps(const int8_t* taps, __m128i* output) {
+inline void PrepareVerticalTaps(const int8_t* LIBGAV1_RESTRICT taps,
+                                __m128i* output) {
   // Avoid overreading the filter due to starting at kernel_offset.
   // The only danger of overread is in the final filter, which has 4 taps.
   const __m128i filter =
@@ -1072,10 +1070,12 @@ __m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo,
 // |width_class| is 2, 4, or 8, according to the Store function that should be
 // used.
 template <int num_taps, int width_class, bool is_compound>
-inline void ConvolveVerticalScale(const int16_t* src, const int width,
-                                  const int subpixel_y, const int filter_index,
-                                  const int step_y, const int height,
-                                  void* dest, const ptrdiff_t dest_stride) {
+inline void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT src,
+                                  const int intermediate_height,
+                                  const int width, const int subpixel_y,
+                                  const int filter_index, const int step_y,
+                                  const int height, void* LIBGAV1_RESTRICT dest,
+                                  const ptrdiff_t dest_stride) {
   constexpr ptrdiff_t src_stride = kIntermediateStride;
   constexpr int kernel_offset = (8 - num_taps) / 2;
   const int16_t* src_y = src;
@@ -1138,15 +1138,19 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width,

   // |width_class| >= 8
   __m128i filter_taps[num_taps >> 1];
-  do {  // y > 0
-    src_y = src + (p >> kScaleSubPixelBits) * src_stride;
-    const int filter_id = (p >> 6) & kSubPixelMask;
-    const int8_t* filter =
-        kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
-    PrepareVerticalTaps<num_taps>(filter, filter_taps);
-
-    int x = 0;
-    do {  // x < width
+  int x = 0;
+  do {  // x < width
+    auto* dest_y = static_cast<uint8_t*>(dest) + x;
+    auto* dest16_y = static_cast<uint16_t*>(dest) + x;
+    int p = subpixel_y & 1023;
+    int y = height;
+    do {  // y > 0
+      const int filter_id = (p >> 6) & kSubPixelMask;
+      const int8_t* filter =
+          kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+      PrepareVerticalTaps<num_taps>(filter, filter_taps);
+
+      src_y = src + (p >> kScaleSubPixelBits) * src_stride;
       for (int i = 0; i < num_taps; ++i) {
         s[i] = LoadUnaligned16(src_y + i * src_stride);
       }
@@ -1154,38 +1158,36 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width,
       const __m128i sums =
           Sum2DVerticalTaps<num_taps, is_compound>(s, filter_taps);
       if (is_compound) {
-        StoreUnaligned16(dest16_y + x, sums);
+        StoreUnaligned16(dest16_y, sums);
       } else {
-        StoreLo8(dest_y + x, _mm_packus_epi16(sums, sums));
+        StoreLo8(dest_y, _mm_packus_epi16(sums, sums));
       }
-      x += 8;
-      src_y += 8;
-    } while (x < width);
-    p += step_y;
-    dest_y += dest_stride;
-    dest16_y += dest_stride;
-  } while (--y != 0);
+      p += step_y;
+      dest_y += dest_stride;
+      dest16_y += dest_stride;
+    } while (--y != 0);
+    src += kIntermediateStride * intermediate_height;
+    x += 8;
+  } while (x < width);
 }

 template <bool is_compound>
-void ConvolveScale2D_SSE4_1(const void* const reference,
+void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
                             const ptrdiff_t reference_stride,
                             const int horizontal_filter_index,
                             const int vertical_filter_index,
                             const int subpixel_x, const int subpixel_y,
                             const int step_x, const int step_y, const int width,
-                            const int height, void* prediction,
+                            const int height, void* LIBGAV1_RESTRICT prediction,
                             const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
   assert(step_x <= 2048);
   // The output of the horizontal filter, i.e. the intermediate_result, is
   // guaranteed to fit in int16_t.
-  // TODO(petersonab): Reduce intermediate block stride to width to make smaller
-  // blocks faster.
   alignas(16) int16_t
-      intermediate_result[kMaxSuperBlockSizeInPixels *
-                          (2 * kMaxSuperBlockSizeInPixels + kSubPixelTaps)];
+      intermediate_result[kIntermediateAllocWidth *
+                          (2 * kIntermediateAllocWidth + kSubPixelTaps)];
   const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
   const int intermediate_height =
       (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
@@ -1282,76 +1284,78 @@ void ConvolveScale2D_SSE4_1(const void* const reference,
     case 1:
       if (!is_compound && width == 2) {
         ConvolveVerticalScale<6, 2, is_compound>(
-            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
-            prediction, pred_stride);
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
       } else if (width == 4) {
         ConvolveVerticalScale<6, 4, is_compound>(
-            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
-            prediction, pred_stride);
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
       } else {
         ConvolveVerticalScale<6, 8, is_compound>(
-            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
-            prediction, pred_stride);
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
       }
       break;
     case 2:
       if (!is_compound && width == 2) {
         ConvolveVerticalScale<8, 2, is_compound>(
-            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
-            prediction, pred_stride);
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
       } else if (width == 4) {
         ConvolveVerticalScale<8, 4, is_compound>(
-            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
-            prediction, pred_stride);
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
       } else {
         ConvolveVerticalScale<8, 8, is_compound>(
-            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
-            prediction, pred_stride);
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
       }
       break;
    case 3:
      if (!is_compound && width == 2) {
        ConvolveVerticalScale<2, 2, is_compound>(
-            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
-            prediction, pred_stride);
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
      } else if (width == 4) {
        ConvolveVerticalScale<2, 4, is_compound>(
-            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
-            prediction, pred_stride);
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
      } else {
        ConvolveVerticalScale<2, 8, is_compound>(
-            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
-            prediction, pred_stride);
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
      }
      break;
    default:
      assert(vert_filter_index == 4 || vert_filter_index == 5);
      if (!is_compound && width == 2) {
        ConvolveVerticalScale<4, 2, is_compound>(
-            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
-            prediction, pred_stride);
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
      } else if (width == 4) {
        ConvolveVerticalScale<4, 4, is_compound>(
-            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
-            prediction, pred_stride);
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
      } else {
        ConvolveVerticalScale<4, 8, is_compound>(
-            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
-            prediction, pred_stride);
+            intermediate, intermediate_height, width, subpixel_y,
+            vert_filter_index, step_y, height, prediction, pred_stride);
      }
   }
 }

-inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+inline void HalfAddHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+                              uint8_t* LIBGAV1_RESTRICT dst) {
   const __m128i left = LoadUnaligned16(src);
   const __m128i right = LoadUnaligned16(src + 1);
   StoreUnaligned16(dst, _mm_avg_epu8(left, right));
 }

 template <int width>
-inline void IntraBlockCopyHorizontal(const uint8_t* src,
+inline void IntraBlockCopyHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
                                      const ptrdiff_t src_stride,
-                                     const int height, uint8_t* dst,
+                                     const int height,
+                                     uint8_t* LIBGAV1_RESTRICT dst,
                                      const ptrdiff_t dst_stride) {
   const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
   const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
@@ -1392,10 +1396,11 @@ inline void IntraBlockCopyHorizontal(const uint8_t* src,
 }

 void ConvolveIntraBlockCopyHorizontal_SSE4_1(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
-    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
-    const int height, void* const prediction, const ptrdiff_t pred_stride) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*subpixel_x*/,
+    const int /*subpixel_y*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
   const auto* src = static_cast<const uint8_t*>(reference);
   auto* dest = static_cast<uint8_t*>(prediction);

@@ -1464,9 +1469,10 @@ void ConvolveIntraBlockCopyHorizontal_SSE4_1(
 }

 template <int width>
-inline void IntraBlockCopyVertical(const uint8_t* src,
+inline void IntraBlockCopyVertical(const uint8_t* LIBGAV1_RESTRICT src,
                                    const ptrdiff_t src_stride, const int height,
-                                   uint8_t* dst, const ptrdiff_t dst_stride) {
+                                   uint8_t* LIBGAV1_RESTRICT dst,
+                                   const ptrdiff_t dst_stride) {
   const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
   const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
   __m128i row[8], below[8];
@@ -1553,11 +1559,11 @@ inline void IntraBlockCopyVertical(const uint8_t* src,
 }

 void ConvolveIntraBlockCopyVertical_SSE4_1(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
-    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
-    const int width, const int height, void* const prediction,
-    const ptrdiff_t pred_stride) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
   const auto* src = static_cast<const uint8_t*>(reference);
   auto* dest = static_cast<uint8_t*>(prediction);

@@ -1622,7 +1628,8 @@ void ConvolveIntraBlockCopyVertical_SSE4_1(
 }

 // Load then add two uint8_t vectors. Return the uint16_t vector result.
-inline __m128i LoadU8AndAddLong(const uint8_t* src, const uint8_t* src1) {
+inline __m128i LoadU8AndAddLong(const uint8_t* LIBGAV1_RESTRICT src,
+                                const uint8_t* LIBGAV1_RESTRICT src1) {
   const __m128i a = _mm_cvtepu8_epi16(LoadLo8(src));
   const __m128i b = _mm_cvtepu8_epi16(LoadLo8(src1));
   return _mm_add_epi16(a, b);
@@ -1637,8 +1644,9 @@ inline __m128i AddU16RightShift2AndPack(__m128i v0, __m128i v1) {
 }

 template <int width>
-inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
-                             const int height, uint8_t* dst,
+inline void IntraBlockCopy2D(const uint8_t* LIBGAV1_RESTRICT src,
+                             const ptrdiff_t src_stride, const int height,
+                             uint8_t* LIBGAV1_RESTRICT dst,
                              const ptrdiff_t dst_stride) {
   const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
   const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
@@ -1793,11 +1801,11 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
 }

 void ConvolveIntraBlockCopy2D_SSE4_1(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
-    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
-    const int width, const int height, void* const prediction,
-    const ptrdiff_t pred_stride) {
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
   const auto* src = static_cast<const uint8_t*>(reference);
   auto* dest = static_cast<uint8_t*>(prediction);
   // Note: allow vertical access to height + 1. Because this function is only
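Besides the restrict annotations, the SSE4 file reworks ConvolveVerticalScale: the column strip becomes the outer loop, the vertical stepping the inner one, and the function now receives intermediate_height so it can advance through the intermediate buffer strip by strip. A compilable sketch of just the new loop order (the filter body is a placeholder):

```cpp
#include <cstddef>
#include <cstdint>

// Sketch of the restructured loop nest; kScaleSubPixelBits = 10 per the AV1
// spec, and the strip advance mirrors the layout assumed by the patch.
void VerticalScaleLoopOrder(const int16_t* src, int intermediate_height,
                            int width, int subpixel_y, int step_y, int height,
                            int16_t* dest, ptrdiff_t dest_stride,
                            ptrdiff_t src_stride) {
  constexpr int kScaleSubPixelBits = 10;
  int x = 0;
  do {  // x < width
    int16_t* dest_y = dest + x;
    int p = subpixel_y & 1023;
    int y = height;
    do {  // y > 0
      const int16_t* src_y = src + (p >> kScaleSubPixelBits) * src_stride;
      dest_y[0] = src_y[0];  // placeholder for the 8-wide tap filter + store
      p += step_y;
      dest_y += dest_stride;
    } while (--y != 0);
    src += src_stride * intermediate_height;  // next 8-wide strip
    x += 8;
  } while (x < width);
}
```

Keeping one strip of the intermediate buffer hot for all of its output rows is friendlier to the cache than re-walking every row across the full width.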
diff --git a/src/dsp/x86/distance_weighted_blend_sse4.cc b/src/dsp/x86/distance_weighted_blend_sse4.cc
index 3c29b19..c813df4 100644
--- a/src/dsp/x86/distance_weighted_blend_sse4.cc
+++ b/src/dsp/x86/distance_weighted_blend_sse4.cc
@@ -54,8 +54,10 @@ inline __m128i ComputeWeightedAverage8(const __m128i& pred0,

 template <int height>
 inline void DistanceWeightedBlend4xH_SSE4_1(
-    const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
-    const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+    const int16_t* LIBGAV1_RESTRICT pred_0,
+    const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
+    const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
   const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));

@@ -98,8 +100,10 @@ inline void DistanceWeightedBlend4xH_SSE4_1(

 template <int height>
 inline void DistanceWeightedBlend8xH_SSE4_1(
-    const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
-    const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+    const int16_t* LIBGAV1_RESTRICT pred_0,
+    const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
+    const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
   const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));

@@ -125,9 +129,10 @@ inline void DistanceWeightedBlend8xH_SSE4_1(
 }

 inline void DistanceWeightedBlendLarge_SSE4_1(
-    const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
-    const uint8_t weight_1, const int width, const int height, void* const dest,
-    const ptrdiff_t dest_stride) {
+    const int16_t* LIBGAV1_RESTRICT pred_0,
+    const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, const int width, const int height,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
   const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));

@@ -154,11 +159,12 @@ inline void DistanceWeightedBlendLarge_SSE4_1(
   } while (--y != 0);
 }

-void DistanceWeightedBlend_SSE4_1(const void* prediction_0,
-                                  const void* prediction_1,
+void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
                                   const uint8_t weight_0,
                                   const uint8_t weight_1, const int width,
-                                  const int height, void* const dest,
+                                  const int height,
+                                  void* LIBGAV1_RESTRICT const dest,
                                   const ptrdiff_t dest_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
@@ -257,8 +263,10 @@ inline __m128i ComputeWeightedAverage8(const __m128i& pred0,

 template <int height>
 inline void DistanceWeightedBlend4xH_SSE4_1(
-    const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0,
-    const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+    const uint16_t* LIBGAV1_RESTRICT pred_0,
+    const uint16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
+    const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint16_t*>(dest);
   const __m128i weight0 = _mm_set1_epi32(weight_0);
   const __m128i weight1 = _mm_set1_epi32(weight_1);
@@ -301,8 +309,10 @@ inline void DistanceWeightedBlend4xH_SSE4_1(

 template <int height>
 inline void DistanceWeightedBlend8xH_SSE4_1(
-    const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0,
-    const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+    const uint16_t* LIBGAV1_RESTRICT pred_0,
+    const uint16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
+    const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint16_t*>(dest);
   const __m128i weight0 = _mm_set1_epi32(weight_0);
   const __m128i weight1 = _mm_set1_epi32(weight_1);
@@ -332,9 +342,10 @@ inline void DistanceWeightedBlend8xH_SSE4_1(
 }

 inline void DistanceWeightedBlendLarge_SSE4_1(
-    const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0,
-    const uint8_t weight_1, const int width, const int height, void* const dest,
-    const ptrdiff_t dest_stride) {
+    const uint16_t* LIBGAV1_RESTRICT pred_0,
+    const uint16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, const int width, const int height,
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint16_t*>(dest);
   const __m128i weight0 = _mm_set1_epi32(weight_0);
   const __m128i weight1 = _mm_set1_epi32(weight_1);
@@ -364,11 +375,12 @@ inline void DistanceWeightedBlendLarge_SSE4_1(
   } while (--y != 0);
 }

-void DistanceWeightedBlend_SSE4_1(const void* prediction_0,
-                                  const void* prediction_1,
+void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
                                   const uint8_t weight_0,
                                   const uint8_t weight_1, const int width,
-                                  const int height, void* const dest,
+                                  const int height,
+                                  void* LIBGAV1_RESTRICT const dest,
                                   const ptrdiff_t dest_stride) {
   const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
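The distance-weighted blend itself is a weighted average with 4-bit weights. A scalar sketch of one 8bpp output pixel, assuming the usual AV1 arrangement where weight_0 + weight_1 == 16 and the compound predictions carry kInterPostRoundBit of extra precision:

```cpp
#include <algorithm>
#include <cstdint>

// Scalar model of one distance-weighted pixel (assumption: 4 weight bits
// plus 4 inter-post rounding bits give a total shift of 8).
uint8_t DistanceWeightedPixel(int16_t pred_0, int16_t pred_1,
                              uint8_t weight_0, uint8_t weight_1) {
  constexpr int kShift = 8;
  const int sum = pred_0 * weight_0 + pred_1 * weight_1 + ((1 << kShift) >> 1);
  return static_cast<uint8_t>(std::clamp(sum >> kShift, 0, 255));
}
```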
@@ -162,11 +148,10 @@ inline __m128i ScaleNoise(const __m128i noise, const __m128i scaling, template <int bitdepth, typename GrainType, typename Pixel> void BlendNoiseWithImageLuma_SSE4_1( - const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift, - int width, int height, int start_height, - const uint8_t scaling_lut_y[kScalingLookupTableSize], - const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y, - ptrdiff_t dest_stride_y) { + const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma, + int scaling_shift, int width, int height, int start_height, + const int16_t* scaling_lut_y, const void* source_plane_y, + ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y) { const auto* noise_image = static_cast<const Array2D<GrainType>*>(noise_image_ptr); const auto* in_y_row = static_cast<const Pixel*>(source_plane_y); @@ -181,7 +166,6 @@ void BlendNoiseWithImageLuma_SSE4_1( do { int x = 0; for (; x < safe_width; x += 8) { - // TODO(b/133525232): Make 16-pixel version of loop body. const __m128i orig = LoadSource(&in_y_row[x]); const __m128i scaling = GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]); @@ -216,9 +200,9 @@ void BlendNoiseWithImageLuma_SSE4_1( template <int bitdepth, typename GrainType, typename Pixel> inline __m128i BlendChromaValsWithCfl( - const Pixel* average_luma_buffer, - const uint8_t scaling_lut[kScalingLookupTableSize], - const Pixel* chroma_cursor, const GrainType* noise_image_cursor, + const Pixel* LIBGAV1_RESTRICT average_luma_buffer, + const int16_t* scaling_lut, const Pixel* LIBGAV1_RESTRICT chroma_cursor, + const GrainType* LIBGAV1_RESTRICT noise_image_cursor, const __m128i scaling_shift) { const __m128i scaling = GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer); @@ -232,11 +216,10 @@ template <int bitdepth, typename GrainType, typename Pixel> LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1( const Array2D<GrainType>& noise_image, int min_value, int max_chroma, int width, int height, int start_height, int subsampling_x, - int subsampling_y, int scaling_shift, - const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* in_y_row, - ptrdiff_t source_stride_y, const Pixel* in_chroma_row, - ptrdiff_t source_stride_chroma, Pixel* out_chroma_row, - ptrdiff_t dest_stride) { + int subsampling_y, int scaling_shift, const int16_t* scaling_lut, + const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y, + const Pixel* in_chroma_row, ptrdiff_t source_stride_chroma, + Pixel* out_chroma_row, ptrdiff_t dest_stride) { const __m128i floor = _mm_set1_epi16(min_value); const __m128i ceiling = _mm_set1_epi16(max_chroma); alignas(16) Pixel luma_buffer[16]; @@ -258,8 +241,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1( int x = 0; for (; x < safe_chroma_width; x += 8) { const int luma_x = x << subsampling_x; - // TODO(petersonab): Consider specializing by subsampling_x. In the 444 - // case &in_y_row[x] can be passed to GetScalingFactors directly. const __m128i average_luma = GetAverageLuma(&in_y_row[luma_x], subsampling_x); StoreUnsigned(average_luma_buffer, average_luma); @@ -277,7 +258,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1( // Prevent huge indices from entering GetScalingFactors due to // uninitialized values. This is not a problem in 8bpp because the table // is made larger than 255 values. 
- if (bitdepth > 8) { + if (bitdepth > kBitdepth8) { memset(luma_buffer, 0, sizeof(luma_buffer)); } const int luma_x = x << subsampling_x; @@ -306,11 +287,11 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1( // This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y. template <int bitdepth, typename GrainType, typename Pixel> void BlendNoiseWithImageChromaWithCfl_SSE4_1( - Plane plane, const FilmGrainParams& params, const void* noise_image_ptr, - int min_value, int max_chroma, int width, int height, int start_height, - int subsampling_x, int subsampling_y, - const uint8_t scaling_lut[kScalingLookupTableSize], - const void* source_plane_y, ptrdiff_t source_stride_y, + Plane plane, const FilmGrainParams& params, + const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, const int16_t* scaling_lut, + const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y, const void* source_plane_uv, ptrdiff_t source_stride_uv, void* dest_plane_uv, ptrdiff_t dest_stride_uv) { const auto* noise_image = @@ -335,10 +316,10 @@ namespace { // |offset| is 32x4 packed to add with the result of _mm_madd_epi16. inline __m128i BlendChromaValsNoCfl8bpp( - const uint8_t scaling_lut[kScalingLookupTableSize], const __m128i& orig, - const int8_t* noise_image_cursor, const __m128i& average_luma, - const __m128i& scaling_shift, const __m128i& offset, - const __m128i& weights) { + const int16_t* scaling_lut, const __m128i& orig, + const int8_t* LIBGAV1_RESTRICT noise_image_cursor, + const __m128i& average_luma, const __m128i& scaling_shift, + const __m128i& offset, const __m128i& weights) { uint8_t merged_buffer[8]; const __m128i combined_lo = _mm_madd_epi16(_mm_unpacklo_epi16(average_luma, orig), weights); @@ -351,9 +332,9 @@ inline __m128i BlendChromaValsNoCfl8bpp( StoreLo8(merged_buffer, _mm_packus_epi16(merged, merged)); const __m128i scaling = - GetScalingFactors<8, uint8_t>(scaling_lut, merged_buffer); + GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer); __m128i noise = LoadSource(noise_image_cursor); - noise = ScaleNoise<8>(noise, scaling, scaling_shift); + noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift); return _mm_add_epi16(orig, noise); } @@ -361,11 +342,10 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1( const Array2D<int8_t>& noise_image, int min_value, int max_chroma, int width, int height, int start_height, int subsampling_x, int subsampling_y, int scaling_shift, int chroma_offset, - int chroma_multiplier, int luma_multiplier, - const uint8_t scaling_lut[kScalingLookupTableSize], const uint8_t* in_y_row, - ptrdiff_t source_stride_y, const uint8_t* in_chroma_row, - ptrdiff_t source_stride_chroma, uint8_t* out_chroma_row, - ptrdiff_t dest_stride) { + int chroma_multiplier, int luma_multiplier, const int16_t* scaling_lut, + const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y, + const uint8_t* in_chroma_row, ptrdiff_t source_stride_chroma, + uint8_t* out_chroma_row, ptrdiff_t dest_stride) { const __m128i floor = _mm_set1_epi16(min_value); const __m128i ceiling = _mm_set1_epi16(max_chroma); @@ -432,11 +412,11 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1( // This function is for the case params_.chroma_scaling_from_luma == false. 
void BlendNoiseWithImageChroma8bpp_SSE4_1( - Plane plane, const FilmGrainParams& params, const void* noise_image_ptr, - int min_value, int max_chroma, int width, int height, int start_height, - int subsampling_x, int subsampling_y, - const uint8_t scaling_lut[kScalingLookupTableSize], - const void* source_plane_y, ptrdiff_t source_stride_y, + Plane plane, const FilmGrainParams& params, + const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, const int16_t* scaling_lut, + const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y, const void* source_plane_uv, ptrdiff_t source_stride_uv, void* dest_plane_uv, ptrdiff_t dest_stride_uv) { assert(plane == kPlaneU || plane == kPlaneV); @@ -463,10 +443,10 @@ void Init8bpp() { assert(dsp != nullptr); dsp->film_grain.blend_noise_luma = - BlendNoiseWithImageLuma_SSE4_1<8, int8_t, uint8_t>; + BlendNoiseWithImageLuma_SSE4_1<kBitdepth8, int8_t, uint8_t>; dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_SSE4_1; dsp->film_grain.blend_noise_chroma[1] = - BlendNoiseWithImageChromaWithCfl_SSE4_1<8, int8_t, uint8_t>; + BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth8, int8_t, uint8_t>; } } // namespace @@ -481,9 +461,9 @@ void Init10bpp() { assert(dsp != nullptr); dsp->film_grain.blend_noise_luma = - BlendNoiseWithImageLuma_SSE4_1<10, int16_t, uint16_t>; + BlendNoiseWithImageLuma_SSE4_1<kBitdepth10, int16_t, uint16_t>; dsp->film_grain.blend_noise_chroma[1] = - BlendNoiseWithImageChromaWithCfl_SSE4_1<10, int16_t, uint16_t>; + BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth10, int16_t, uint16_t>; } } // namespace diff --git a/src/dsp/x86/intra_edge_sse4.cc b/src/dsp/x86/intra_edge_sse4.cc index d6af907..967be06 100644 --- a/src/dsp/x86/intra_edge_sse4.cc +++ b/src/dsp/x86/intra_edge_sse4.cc @@ -41,7 +41,8 @@ constexpr int kMaxEdgeBufferSize = 129; // This function applies the kernel [0, 4, 8, 4, 0] to 12 values. // Assumes |edge| has 16 packed byte values. Produces 12 filter outputs to // write as overlapping sets of 8-bytes. -inline void ComputeKernel1Store12(uint8_t* dest, const uint8_t* source) { +inline void ComputeKernel1Store12(uint8_t* LIBGAV1_RESTRICT dest, + const uint8_t* LIBGAV1_RESTRICT source) { const __m128i edge_lo = LoadUnaligned16(source); const __m128i edge_hi = _mm_srli_si128(edge_lo, 6); // Samples matched with the '4' tap, expanded to 16-bit. @@ -77,7 +78,8 @@ inline void ComputeKernel1Store12(uint8_t* dest, const uint8_t* source) { // This function applies the kernel [0, 5, 6, 5, 0] to 12 values. // Assumes |edge| has 8 packed byte values, and that the 2 invalid values will // be overwritten or safely discarded. -inline void ComputeKernel2Store12(uint8_t* dest, const uint8_t* source) { +inline void ComputeKernel2Store12(uint8_t* LIBGAV1_RESTRICT dest, + const uint8_t* LIBGAV1_RESTRICT source) { const __m128i edge_lo = LoadUnaligned16(source); const __m128i edge_hi = _mm_srli_si128(edge_lo, 6); const __m128i outers_lo = _mm_cvtepu8_epi16(edge_lo); @@ -115,7 +117,8 @@ inline void ComputeKernel2Store12(uint8_t* dest, const uint8_t* source) { } // This function applies the kernel [2, 4, 4, 4, 2] to 8 values. 
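All three intra-edge helpers, including ComputeKernel3Store8 below, vectorize the same 5-tap smoothing named in their comments. Each kernel's taps sum to 16, so every output is a rounded 4-bit shift. A scalar sketch (boundary clamping at the ends of the edge omitted); kEdgeKernels and FilterEdgePixel are illustrative names:

#include <cstdint>

// Taps for edge-filter strengths 1..3; each row sums to 16.
constexpr int kEdgeKernels[3][5] = {
    {0, 4, 8, 4, 0},  // ComputeKernel1Store12
    {0, 5, 6, 5, 0},  // ComputeKernel2Store12
    {2, 4, 4, 4, 2},  // ComputeKernel3Store8
};

// Sketch only: one filtered edge sample.
inline uint8_t FilterEdgePixel(const uint8_t* edge, int i, int strength) {
  int sum = 0;
  for (int t = -2; t <= 2; ++t) {
    sum += kEdgeKernels[strength - 1][t + 2] * edge[i + t];
  }
  return static_cast<uint8_t>((sum + 8) >> 4);  // taps sum to 16
}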
-inline void ComputeKernel3Store8(uint8_t* dest, const uint8_t* source) { +inline void ComputeKernel3Store8(uint8_t* LIBGAV1_RESTRICT dest, + const uint8_t* LIBGAV1_RESTRICT source) { const __m128i edge_lo = LoadUnaligned16(source); const __m128i edge_hi = _mm_srli_si128(edge_lo, 4); // Finish |edge_lo| life cycle quickly. diff --git a/src/dsp/x86/intrapred_cfl_sse4.cc b/src/dsp/x86/intrapred_cfl_sse4.cc index f2dcfdb..eb7e466 100644 --- a/src/dsp/x86/intrapred_cfl_sse4.cc +++ b/src/dsp/x86/intrapred_cfl_sse4.cc @@ -88,7 +88,7 @@ inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12, template <int width, int height> void CflIntraPredictor_SSE4_1( - void* const dest, ptrdiff_t stride, + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int alpha) { auto* dst = static_cast<uint8_t*>(dest); @@ -127,7 +127,8 @@ void CflIntraPredictor_SSE4_1( template <int block_height_log2, bool is_inside> void CflSubsampler444_4xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], - const int max_luma_height, const void* const source, ptrdiff_t stride) { + const int max_luma_height, const void* LIBGAV1_RESTRICT const source, + ptrdiff_t stride) { static_assert(block_height_log2 <= 4, ""); const int block_height = 1 << block_height_log2; const int visible_height = max_luma_height; @@ -189,7 +190,7 @@ template <int block_height_log2> void CflSubsampler444_4xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int max_luma_width, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { static_assert(block_height_log2 <= 4, ""); assert(max_luma_width >= 4); assert(max_luma_height >= 4); @@ -209,7 +210,7 @@ template <int block_height_log2, bool inside> void CflSubsampler444_8xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int max_luma_width, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { static_assert(block_height_log2 <= 5, ""); const int block_height = 1 << block_height_log2, block_width = 8; const int visible_height = max_luma_height; @@ -292,7 +293,7 @@ template <int block_height_log2> void CflSubsampler444_8xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int max_luma_width, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { static_assert(block_height_log2 <= 5, ""); assert(max_luma_width >= 4); assert(max_luma_height >= 4); @@ -315,7 +316,7 @@ template <int block_width_log2, int block_height_log2, bool inside> void CflSubsampler444_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int max_luma_width, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { static_assert(block_width_log2 == 4 || block_width_log2 == 5, ""); static_assert(block_height_log2 <= 5, ""); assert(max_luma_width >= 4); @@ -418,7 +419,7 @@ template <int block_width_log2, int block_height_log2> void CflSubsampler444_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int max_luma_width, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { static_assert(block_width_log2 == 4 || 
block_width_log2 == 5, ""); static_assert(block_height_log2 <= 5, ""); assert(max_luma_width >= 4); @@ -441,7 +442,7 @@ template <int block_height_log2> void CflSubsampler420_4xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int /*max_luma_width*/, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { const int block_height = 1 << block_height_log2; const auto* src = static_cast<const uint8_t*>(source); int16_t* luma_ptr = luma[0]; @@ -511,7 +512,7 @@ template <int block_height_log2, int max_luma_width> inline void CflSubsampler420Impl_8xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int /*max_luma_width*/, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { const int block_height = 1 << block_height_log2; const auto* src = static_cast<const uint8_t*>(source); const __m128i zero = _mm_setzero_si128(); @@ -620,7 +621,7 @@ template <int block_height_log2> void CflSubsampler420_8xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int max_luma_width, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { if (max_luma_width == 8) { CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>( luma, max_luma_width, max_luma_height, source, stride); @@ -634,7 +635,7 @@ template <int block_width_log2, int block_height_log2, int max_luma_width> inline void CflSubsampler420Impl_WxH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int /*max_luma_width*/, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { const auto* src = static_cast<const uint8_t*>(source); const __m128i zero = _mm_setzero_si128(); __m128i final_sum = zero; @@ -751,7 +752,7 @@ template <int block_width_log2, int block_height_log2> void CflSubsampler420_WxH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int max_luma_width, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { switch (max_luma_width) { case 8: CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>( @@ -968,7 +969,7 @@ inline __m128i ClipEpi16(__m128i x, __m128i min, __m128i max) { template <int width, int height> void CflIntraPredictor_10bpp_SSE4_1( - void* const dest, ptrdiff_t stride, + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int alpha) { constexpr int kCflLumaBufferStrideLog2_16i = 5; @@ -1018,7 +1019,8 @@ void CflIntraPredictor_10bpp_SSE4_1( template <int block_height_log2, bool is_inside> void CflSubsampler444_4xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], - const int max_luma_height, const void* const source, ptrdiff_t stride) { + const int max_luma_height, const void* LIBGAV1_RESTRICT const source, + ptrdiff_t stride) { static_assert(block_height_log2 <= 4, ""); const int block_height = 1 << block_height_log2; const int visible_height = max_luma_height; @@ -1079,7 +1081,7 @@ template <int block_height_log2> void CflSubsampler444_4xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int max_luma_width, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + 
const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { static_cast<void>(max_luma_width); static_cast<void>(max_luma_height); static_assert(block_height_log2 <= 4, ""); @@ -1099,7 +1101,8 @@ void CflSubsampler444_4xH_SSE4_1( template <int block_height_log2, bool is_inside> void CflSubsampler444_8xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], - const int max_luma_height, const void* const source, ptrdiff_t stride) { + const int max_luma_height, const void* LIBGAV1_RESTRICT const source, + ptrdiff_t stride) { const int block_height = 1 << block_height_log2; const int visible_height = max_luma_height; const __m128i dup16 = _mm_set1_epi32(0x01000100); @@ -1158,7 +1161,7 @@ template <int block_height_log2> void CflSubsampler444_8xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int max_luma_width, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { static_cast<void>(max_luma_width); static_cast<void>(max_luma_height); static_assert(block_height_log2 <= 5, ""); @@ -1182,7 +1185,7 @@ template <int block_width_log2, int block_height_log2, bool is_inside> void CflSubsampler444_WxH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int max_luma_width, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { const int block_height = 1 << block_height_log2; const int visible_height = max_luma_height; const int block_width = 1 << block_width_log2; @@ -1278,7 +1281,7 @@ template <int block_width_log2, int block_height_log2> void CflSubsampler444_WxH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int max_luma_width, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { static_assert(block_width_log2 == 4 || block_width_log2 == 5, "This function will only work for block_width 16 and 32."); static_assert(block_height_log2 <= 5, ""); @@ -1300,7 +1303,7 @@ template <int block_height_log2> void CflSubsampler420_4xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int /*max_luma_width*/, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { const int block_height = 1 << block_height_log2; const auto* src = static_cast<const uint16_t*>(source); const ptrdiff_t src_stride = stride / sizeof(src[0]); @@ -1371,7 +1374,8 @@ void CflSubsampler420_4xH_SSE4_1( template <int block_height_log2, int max_luma_width> inline void CflSubsampler420Impl_8xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], - const int max_luma_height, const void* const source, ptrdiff_t stride) { + const int max_luma_height, const void* LIBGAV1_RESTRICT const source, + ptrdiff_t stride) { const int block_height = 1 << block_height_log2; const auto* src = static_cast<const uint16_t*>(source); const ptrdiff_t src_stride = stride / sizeof(src[0]); @@ -1483,7 +1487,7 @@ template <int block_height_log2> void CflSubsampler420_8xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int max_luma_width, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { if (max_luma_width == 8) { CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(luma, max_luma_height, source, stride); 
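Taken together, the CfL hunks follow the same three-step pipeline in every width/height/bitdepth variant: subsample luma into Q3, subtract the block average so only the AC part remains, then scale by the signalled alpha and add the chroma DC. A compact scalar sketch of the 8bpp form, assuming kCflLumaBufferStride is the 32 used by these tables and omitting the edge replication for partially visible blocks; the helper names are illustrative:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdlib>

// Step 1 (4:2:0 sketch): each entry is the 2x2 luma sum doubled, i.e. the
// average left-shifted into Q3.
inline int16_t Subsample420(const uint8_t* src, ptrdiff_t stride) {
  return static_cast<int16_t>(
      (src[0] + src[1] + src[stride] + src[stride + 1]) << 1);
}

// Step 2: remove the block mean so |luma| holds only the AC contribution.
inline void SubtractAverage(int16_t luma[][32], int w_log2, int h_log2) {
  int32_t sum = 0;
  for (int y = 0; y < (1 << h_log2); ++y)
    for (int x = 0; x < (1 << w_log2); ++x) sum += luma[y][x];
  const int count_log2 = w_log2 + h_log2;
  const int16_t average =
      static_cast<int16_t>((sum + (1 << (count_log2 - 1))) >> count_log2);
  for (int y = 0; y < (1 << h_log2); ++y)
    for (int x = 0; x < (1 << w_log2); ++x) luma[y][x] -= average;
}

// Step 3: the predictor itself, with AV1's signed rounding. |alpha_q3| is
// the signalled CfL scaling parameter.
inline uint8_t CflPredict(int dc, int alpha_q3, int luma_ac_q3) {
  const int scaled = alpha_q3 * luma_ac_q3;   // Q6
  const int mag = (std::abs(scaled) + 32) >> 6;
  return static_cast<uint8_t>(
      std::clamp(dc + (scaled < 0 ? -mag : mag), 0, 255));
}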
@@ -1496,7 +1500,8 @@ void CflSubsampler420_8xH_SSE4_1( template <int block_width_log2, int block_height_log2, int max_luma_width> inline void CflSubsampler420Impl_WxH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], - const int max_luma_height, const void* const source, ptrdiff_t stride) { + const int max_luma_height, const void* LIBGAV1_RESTRICT const source, + ptrdiff_t stride) { const auto* src = static_cast<const uint16_t*>(source); const ptrdiff_t src_stride = stride / sizeof(src[0]); const __m128i zero = _mm_setzero_si128(); @@ -1615,7 +1620,7 @@ template <int block_width_log2, int block_height_log2> void CflSubsampler420_WxH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], const int max_luma_width, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { switch (max_luma_width) { case 8: CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>( diff --git a/src/dsp/x86/intrapred_filter_sse4.cc b/src/dsp/x86/intrapred_filter_sse4.cc index 022af8d..a43a5cf 100644 --- a/src/dsp/x86/intrapred_filter_sse4.cc +++ b/src/dsp/x86/intrapred_filter_sse4.cc @@ -64,10 +64,10 @@ constexpr int kDuplicateFirstHalf = 0x44; // at zero to preserve the sum. // |pixels| contains p0-p7 in order as shown above. // |taps_0_1| contains the filter kernels used to predict f0 and f1, and so on. -inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride, - const __m128i& pixels, const __m128i& taps_0_1, - const __m128i& taps_2_3, const __m128i& taps_4_5, - const __m128i& taps_6_7) { +inline void Filter4x2_SSE4_1(uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t stride, const __m128i& pixels, + const __m128i& taps_0_1, const __m128i& taps_2_3, + const __m128i& taps_4_5, const __m128i& taps_6_7) { const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1); const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3); // |output_half| contains 8 partial sums for f0-f7. @@ -93,10 +93,10 @@ inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride, // for successive blocks. This implementation takes advantage of the fact // that the p5 and p6 for each sub-block come solely from the |left_ptr| buffer, // using shifts to arrange things to fit reusable shuffle vectors. -inline void Filter4xH(uint8_t* dest, ptrdiff_t stride, - const uint8_t* const top_ptr, - const uint8_t* const left_ptr, FilterIntraPredictor pred, - const int height) { +inline void Filter4xH(uint8_t* LIBGAV1_RESTRICT dest, ptrdiff_t stride, + const uint8_t* LIBGAV1_RESTRICT const top_ptr, + const uint8_t* LIBGAV1_RESTRICT const left_ptr, + FilterIntraPredictor pred, const int height) { // Two filter kernels per vector. 
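Before the tap loads that follow, it helps to see the scalar operation Filter4x2_SSE4_1 implements: the seven neighbours p0..p6 (top-left, four above, two left) produce each output of a 4x2 sub-block through per-mode kernels from kFilterIntraTaps, rounded by a signed 4-bit shift. The x86 tables pad each kernel to eight taps with a zero, per the "p0-p7" comment above. A sketch over the seven live taps; FilterIntraPixel is an illustrative name:

#include <algorithm>
#include <cstdint>
#include <cstdlib>

// Sketch only: one predicted pixel of a 4x2 filter-intra sub-block.
// |taps| is one kernel of kFilterIntraTaps for the chosen mode.
inline uint8_t FilterIntraPixel(const int8_t taps[7], const uint8_t p[7]) {
  int sum = 0;
  for (int i = 0; i < 7; ++i) sum += taps[i] * p[i];
  const int mag = (std::abs(sum) + 8) >> 4;  // signed rounding, shift of 4
  return static_cast<uint8_t>(std::clamp(sum < 0 ? -mag : mag, 0, 255));
}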
const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]); const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]); @@ -271,9 +271,10 @@ inline void Filter4xH(uint8_t* dest, ptrdiff_t stride, } } -void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column, +void FilterIntraPredictor_SSE4_1(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column, FilterIntraPredictor pred, const int width, const int height) { const auto* const top_ptr = static_cast<const uint8_t*>(top_row); diff --git a/src/dsp/x86/intrapred_smooth_sse4.cc b/src/dsp/x86/intrapred_smooth_sse4.cc index de9f551..b53ee8c 100644 --- a/src/dsp/x86/intrapred_smooth_sse4.cc +++ b/src/dsp/x86/intrapred_smooth_sse4.cc @@ -38,23 +38,12 @@ namespace { // to have visibility of the values. This helps reduce loads and in the // creation of the inverse weights. constexpr uint8_t kSmoothWeights[] = { - // block dimension = 4 - 255, 149, 85, 64, - // block dimension = 8 - 255, 197, 146, 105, 73, 50, 37, 32, - // block dimension = 16 - 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, - // block dimension = 32 - 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, - 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, - // block dimension = 64 - 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, - 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, - 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, - 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4}; +#include "src/dsp/smooth_weights.inc" +}; template <int y_mask> -inline void WriteSmoothHorizontalSum4(void* const dest, const __m128i& left, +inline void WriteSmoothHorizontalSum4(void* LIBGAV1_RESTRICT const dest, + const __m128i& left, const __m128i& weights, const __m128i& scaled_top_right, const __m128i& round) { @@ -77,7 +66,8 @@ inline __m128i SmoothDirectionalSum8(const __m128i& pixels, return _mm_add_epi16(scaled_corner, weighted_px); } -inline void WriteSmoothDirectionalSum8(uint8_t* dest, const __m128i& pixels, +inline void WriteSmoothDirectionalSum8(uint8_t* LIBGAV1_RESTRICT dest, + const __m128i& pixels, const __m128i& weights, const __m128i& scaled_corner, const __m128i& round) { @@ -91,13 +81,11 @@ inline void WriteSmoothDirectionalSum8(uint8_t* dest, const __m128i& pixels, // For Horizontal, pixels1 and pixels2 are the same repeated value. For // Vertical, weights1 and weights2 are the same, and scaled_corner1 and // scaled_corner2 are the same. 
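The directional-sum helpers above all evaluate one 256-scaled blend per pixel; for the horizontal family the varying value is left[y] and the fixed corner is the top-right pixel, with (256 - w) * corner hoisted out of the loop as scaled_top_right. A scalar sketch; SmoothHorizontalPixel is an illustrative name:

#include <cstdint>

// Sketch only: one SMOOTH_H pixel. |w_x| comes from kSmoothWeights,
// indexed by column.
inline uint8_t SmoothHorizontalPixel(uint8_t left_y, uint8_t top_right,
                                     uint8_t w_x) {
  const int scaled_top_right = (256 - w_x) * top_right;  // loop-invariant
  return static_cast<uint8_t>((w_x * left_y + scaled_top_right + 128) >> 8);
}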
-inline void WriteSmoothDirectionalSum16(uint8_t* dest, const __m128i& pixels1, - const __m128i& pixels2, - const __m128i& weights1, - const __m128i& weights2, - const __m128i& scaled_corner1, - const __m128i& scaled_corner2, - const __m128i& round) { +inline void WriteSmoothDirectionalSum16( + uint8_t* LIBGAV1_RESTRICT dest, const __m128i& pixels1, + const __m128i& pixels2, const __m128i& weights1, const __m128i& weights2, + const __m128i& scaled_corner1, const __m128i& scaled_corner2, + const __m128i& round) { const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1); const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2); const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1); @@ -109,8 +97,9 @@ inline void WriteSmoothDirectionalSum16(uint8_t* dest, const __m128i& pixels1, } template <int y_mask> -inline void WriteSmoothPredSum4(uint8_t* const dest, const __m128i& top, - const __m128i& left, const __m128i& weights_x, +inline void WriteSmoothPredSum4(uint8_t* LIBGAV1_RESTRICT const dest, + const __m128i& top, const __m128i& left, + const __m128i& weights_x, const __m128i& weights_y, const __m128i& scaled_bottom_left, const __m128i& scaled_top_right, @@ -135,7 +124,8 @@ inline void WriteSmoothPredSum4(uint8_t* const dest, const __m128i& top, // pixels[0]: above and below_pred interleave vector // pixels[1]: left vector // pixels[2]: right_pred vector -inline void LoadSmoothPixels4(const uint8_t* above, const uint8_t* left, +inline void LoadSmoothPixels4(const uint8_t* LIBGAV1_RESTRICT above, + const uint8_t* LIBGAV1_RESTRICT left, const int height, __m128i* pixels) { if (height == 4) { pixels[1] = Load4(left); @@ -156,8 +146,9 @@ inline void LoadSmoothPixels4(const uint8_t* above, const uint8_t* left, // weight_h[2]: same as [0], second half for height = 16 only // weight_h[3]: same as [1], second half for height = 16 only // weight_w[0]: weights_w and scale - weights_w interleave vector -inline void LoadSmoothWeights4(const uint8_t* weight_array, const int height, - __m128i* weight_h, __m128i* weight_w) { +inline void LoadSmoothWeights4(const uint8_t* LIBGAV1_RESTRICT weight_array, + const int height, __m128i* weight_h, + __m128i* weight_w) { const __m128i scale = _mm_set1_epi16(256); const __m128i x_weights = Load4(weight_array); weight_h[0] = _mm_cvtepu8_epi16(x_weights); @@ -179,7 +170,8 @@ inline void LoadSmoothWeights4(const uint8_t* weight_array, const int height, } inline void WriteSmoothPred4x8(const __m128i* pixel, const __m128i* weights_y, - const __m128i* weight_x, uint8_t* dst, + const __m128i* weight_x, + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const bool use_second_half) { const __m128i round = _mm_set1_epi32(256); @@ -215,8 +207,9 @@ inline void WriteSmoothPred4x8(const __m128i* pixel, const __m128i* weights_y, // The interleaving approach has some overhead that causes it to underperform in // the 4x4 case. 
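WriteSmoothPredSum4 above combines both directions: the vertical blend runs toward the bottom-left corner, the horizontal one toward the top-right, and the two sums share a single 9-bit rounding shift. In scalar form (SmoothPixel is an illustrative name):

#include <cstdint>

// Sketch only: one SMOOTH pixel, both directions. |w_y| indexes
// kSmoothWeights by row, |w_x| by column.
inline uint8_t SmoothPixel(uint8_t top_x, uint8_t left_y, uint8_t top_right,
                           uint8_t bottom_left, uint8_t w_x, uint8_t w_y) {
  const int vertical = w_y * top_x + (256 - w_y) * bottom_left;
  const int horizontal = w_x * left_y + (256 - w_x) * top_right;
  return static_cast<uint8_t>((vertical + horizontal + 256) >> 9);
}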
-void Smooth4x4_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* top_row, const void* left_column) { +void Smooth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT top_row, + const void* LIBGAV1_RESTRICT left_column) { const __m128i top = _mm_cvtepu8_epi32(Load4(top_row)); const __m128i left = _mm_cvtepu8_epi32(Load4(left_column)); const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights)); @@ -247,8 +240,9 @@ void Smooth4x4_SSE4_1(void* const dest, const ptrdiff_t stride, scaled_bottom_left, scaled_top_right, scale); } -void Smooth4x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* top_row, const void* left_column) { +void Smooth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT top_row, + const void* LIBGAV1_RESTRICT left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); __m128i weights_x[1]; @@ -260,8 +254,10 @@ void Smooth4x8_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false); } -void Smooth4x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* top_row, const void* left_column) { +void Smooth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT top_row, + const void* LIBGAV1_RESTRICT left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); __m128i weights_x[1]; @@ -283,7 +279,8 @@ void Smooth4x16_SSE4_1(void* const dest, const ptrdiff_t stride, // pixels[5]: above and below_pred interleave vector, second half // pixels[6]: left vector + 16 // pixels[7]: right_pred vector -inline void LoadSmoothPixels8(const uint8_t* above, const uint8_t* left, +inline void LoadSmoothPixels8(const uint8_t* LIBGAV1_RESTRICT above, + const uint8_t* LIBGAV1_RESTRICT left, const int height, __m128i* pixels) { const __m128i bottom_left = _mm_set1_epi16(left[height - 1]); __m128i top_row = _mm_cvtepu8_epi16(LoadLo8(above)); @@ -317,8 +314,9 @@ inline void LoadSmoothPixels8(const uint8_t* above, const uint8_t* left, // weight_h[7]: same as [1], offset 24 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half // weight_w[1]: weights_w and scale - weights_w interleave vector, second half -inline void LoadSmoothWeights8(const uint8_t* weight_array, const int height, - __m128i* weight_w, __m128i* weight_h) { +inline void LoadSmoothWeights8(const uint8_t* LIBGAV1_RESTRICT weight_array, + const int height, __m128i* weight_w, + __m128i* weight_h) { const int offset = (height < 8) ? 
0 : 4; __m128i loaded_weights = LoadUnaligned16(&weight_array[offset]); weight_h[0] = _mm_cvtepu8_epi16(loaded_weights); @@ -360,7 +358,8 @@ inline void LoadSmoothWeights8(const uint8_t* weight_array, const int height, inline void WriteSmoothPred8xH(const __m128i* pixels, const __m128i* weights_x, const __m128i* weights_y, const int height, - uint8_t* dst, const ptrdiff_t stride, + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t stride, const bool use_second_half) { const __m128i round = _mm_set1_epi32(256); const __m128i mask_increment = _mm_set1_epi16(0x0202); @@ -405,8 +404,9 @@ inline void WriteSmoothPred8xH(const __m128i* pixels, const __m128i* weights_x, } } -void Smooth8x4_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* top_row, const void* left_column) { +void Smooth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT top_row, + const void* LIBGAV1_RESTRICT left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); __m128i pixels[4]; @@ -419,8 +419,9 @@ void Smooth8x4_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothPred8xH(pixels, weights_x, weights_y, 4, dst, stride, false); } -void Smooth8x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* top_row, const void* left_column) { +void Smooth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT top_row, + const void* LIBGAV1_RESTRICT left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); @@ -434,8 +435,10 @@ void Smooth8x8_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false); } -void Smooth8x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* top_row, const void* left_column) { +void Smooth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT top_row, + const void* LIBGAV1_RESTRICT left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); __m128i pixels[4]; @@ -450,8 +453,10 @@ void Smooth8x16_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true); } -void Smooth8x32_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* top_row, const void* left_column) { +void Smooth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT top_row, + const void* LIBGAV1_RESTRICT left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); __m128i pixels[8]; @@ -473,8 +478,9 @@ void Smooth8x32_SSE4_1(void* const dest, const ptrdiff_t stride, } template <int width, int height> -void SmoothWxH(void* const dest, const ptrdiff_t stride, - const void* const top_row, const void* const left_column) { +void SmoothWxH(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); const uint8_t* const sm_weights_h = kSmoothWeights + height - 4; @@ -532,8 +538,10 @@ void 
SmoothWxH(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal4x4_SSE4_1(void* dest, const ptrdiff_t stride, - const void* top_row, const void* left_column) { +void SmoothHorizontal4x4_SSE4_1(void* LIBGAV1_RESTRICT dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT top_row, + const void* LIBGAV1_RESTRICT left_column) { const auto* const top_ptr = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi32(top_ptr[3]); const auto* const left_ptr = static_cast<const uint8_t*>(left_column); @@ -553,9 +561,10 @@ void SmoothHorizontal4x4_SSE4_1(void* dest, const ptrdiff_t stride, WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale); } -void SmoothHorizontal4x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal4x8_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi32(top[3]); const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights)); @@ -585,9 +594,10 @@ void SmoothHorizontal4x8_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale); } -void SmoothHorizontal4x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal4x16_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi32(top[3]); const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights)); @@ -637,9 +647,10 @@ void SmoothHorizontal4x16_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale); } -void SmoothHorizontal8x4_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal8x4_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[7]); const __m128i left = _mm_cvtepu8_epi16(Load4(left_column)); @@ -666,9 +677,10 @@ void SmoothHorizontal8x4_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale); } -void SmoothHorizontal8x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal8x8_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[7]); const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column)); @@ -686,9 +698,10 @@ void SmoothHorizontal8x8_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal8x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal8x16_SSE4_1( + void* 
LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[7]); const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4)); @@ -714,9 +727,10 @@ void SmoothHorizontal8x16_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal8x32_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal8x32_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[7]); const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4)); @@ -756,9 +770,10 @@ void SmoothHorizontal8x32_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal16x4_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal16x4_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[15]); const __m128i left = _mm_cvtepu8_epi16(Load4(left_column)); @@ -795,9 +810,10 @@ void SmoothHorizontal16x4_SSE4_1(void* const dest, const ptrdiff_t stride, scaled_top_right1, scaled_top_right2, scale); } -void SmoothHorizontal16x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal16x8_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[15]); const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column)); @@ -822,9 +838,10 @@ void SmoothHorizontal16x8_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal16x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal16x16_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[15]); const __m128i weights = LoadUnaligned16(kSmoothWeights + 12); @@ -858,9 +875,10 @@ void SmoothHorizontal16x16_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal16x32_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal16x32_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[15]); const __m128i weights = LoadUnaligned16(kSmoothWeights + 12); @@ -910,9 +928,10 @@ void SmoothHorizontal16x32_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void 
SmoothHorizontal16x64_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal16x64_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[15]); const __m128i weights = LoadUnaligned16(kSmoothWeights + 12); @@ -940,9 +959,10 @@ void SmoothHorizontal16x64_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal32x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal32x8_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[31]); const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column)); @@ -978,9 +998,10 @@ void SmoothHorizontal32x8_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal32x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal32x16_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[31]); const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column)); @@ -1027,9 +1048,10 @@ void SmoothHorizontal32x16_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal32x32_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal32x32_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[31]); const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28); @@ -1096,9 +1118,10 @@ void SmoothHorizontal32x32_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal32x64_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal32x64_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[31]); const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28); @@ -1137,9 +1160,10 @@ void SmoothHorizontal32x64_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal64x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal64x16_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[63]); const __m128i left1 = 
_mm_cvtepu8_epi16(LoadLo8(left_column)); @@ -1212,9 +1236,10 @@ void SmoothHorizontal64x16_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal64x32_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal64x32_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[63]); const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column)); @@ -1315,9 +1340,10 @@ void SmoothHorizontal64x32_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal64x64_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal64x64_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[63]); const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60); @@ -1378,7 +1404,8 @@ void SmoothHorizontal64x64_SSE4_1(void* const dest, const ptrdiff_t stride, } } -inline void LoadSmoothVerticalPixels4(const uint8_t* above, const uint8_t* left, +inline void LoadSmoothVerticalPixels4(const uint8_t* LIBGAV1_RESTRICT above, + const uint8_t* LIBGAV1_RESTRICT left, const int height, __m128i* pixels) { __m128i top = Load4(above); const __m128i bottom_left = _mm_set1_epi16(left[height - 1]); @@ -1390,7 +1417,8 @@ inline void LoadSmoothVerticalPixels4(const uint8_t* above, const uint8_t* left, // (256-w) counterparts. This is precomputed by the compiler when the weights // table is visible to this module. Removing this visibility can cut speed by up // to half in both 4xH and 8xH transforms. 
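The visibility note above refers to exactly this precomputation: each weight is stored next to its 256 - w inverse so a single _mm_madd_epi16 yields w * pixel + (256 - w) * corner in one step. A sketch of the layout the loaders build; InterleaveWeights is an illustrative name:

#include <cstdint>

// Sketch only: interleave weights with their inverses for _mm_madd_epi16.
inline void InterleaveWeights(const uint8_t* w, int n, int16_t* out) {
  for (int i = 0; i < n; ++i) {
    out[2 * i + 0] = w[i];        // multiplies the varying pixel
    out[2 * i + 1] = 256 - w[i];  // multiplies the fixed corner pixel
  }
}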
-inline void LoadSmoothVerticalWeights4(const uint8_t* weight_array, +inline void LoadSmoothVerticalWeights4(const uint8_t* LIBGAV1_RESTRICT + weight_array, const int height, __m128i* weights) { const __m128i inverter = _mm_set1_epi16(256); @@ -1413,7 +1441,8 @@ inline void LoadSmoothVerticalWeights4(const uint8_t* weight_array, } inline void WriteSmoothVertical4xH(const __m128i* pixel, const __m128i* weight, - const int height, uint8_t* dst, + const int height, + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride) { const __m128i pred_round = _mm_set1_epi32(128); const __m128i mask_increment = _mm_set1_epi16(0x0202); @@ -1438,9 +1467,10 @@ inline void WriteSmoothVertical4xH(const __m128i* pixel, const __m128i* weight, } } -void SmoothVertical4x4_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left = static_cast<const uint8_t*>(left_column); const auto* const above = static_cast<const uint8_t*>(top_row); auto* dst = static_cast<uint8_t*>(dest); @@ -1453,9 +1483,10 @@ void SmoothVertical4x4_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothVertical4xH(&pixels, weights, 4, dst, stride); } -void SmoothVertical4x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left = static_cast<const uint8_t*>(left_column); const auto* const above = static_cast<const uint8_t*>(top_row); auto* dst = static_cast<uint8_t*>(dest); @@ -1468,9 +1499,10 @@ void SmoothVertical4x8_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride); } -void SmoothVertical4x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left = static_cast<const uint8_t*>(left_column); const auto* const above = static_cast<const uint8_t*>(top_row); auto* dst = static_cast<uint8_t*>(dest); @@ -1485,9 +1517,10 @@ void SmoothVertical4x16_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothVertical4xH(&pixels, &weights[2], 8, dst, stride); } -void SmoothVertical8x4_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]); const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights)); @@ -1520,9 +1553,10 @@ void SmoothVertical8x4_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale); } -void SmoothVertical8x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void 
SmoothVertical8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]); const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4)); @@ -1544,9 +1578,10 @@ void SmoothVertical8x8_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical8x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]); const __m128i weights = LoadUnaligned16(kSmoothWeights + 12); @@ -1583,9 +1618,10 @@ void SmoothVertical8x16_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical8x32_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const __m128i zero = _mm_setzero_si128(); const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]); @@ -1649,9 +1685,10 @@ void SmoothVertical8x32_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical16x4_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]); @@ -1694,9 +1731,10 @@ void SmoothVertical16x4_SSE4_1(void* const dest, const ptrdiff_t stride, scale); } -void SmoothVertical16x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical16x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]); @@ -1722,9 +1760,10 @@ void SmoothVertical16x8_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical16x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical16x16_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]); @@ -1766,9 +1805,10 @@ void SmoothVertical16x16_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical16x32_SSE4_1(void* const 
dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical16x32_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]); @@ -1839,9 +1879,10 @@ void SmoothVertical16x32_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical16x64_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical16x64_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]); @@ -1887,9 +1928,10 @@ void SmoothVertical16x64_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical32x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical32x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); auto* dst = static_cast<uint8_t*>(dest); @@ -1922,9 +1964,10 @@ void SmoothVertical32x8_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical32x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical32x16_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); auto* dst = static_cast<uint8_t*>(dest); @@ -1975,9 +2018,10 @@ void SmoothVertical32x16_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical32x32_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical32x32_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); @@ -2063,9 +2107,10 @@ void SmoothVertical32x32_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical32x64_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical32x64_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); @@ -2120,9 +2165,10 @@ void 
SmoothVertical32x64_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical64x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical64x16_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); @@ -2192,9 +2238,10 @@ void SmoothVertical64x16_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical64x32_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical64x32_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); @@ -2311,9 +2358,10 @@ void SmoothVertical64x32_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical64x64_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical64x64_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); diff --git a/src/dsp/x86/intrapred_sse4.cc b/src/dsp/x86/intrapred_sse4.cc index 063929d..556afed 100644 --- a/src/dsp/x86/intrapred_sse4.cc +++ b/src/dsp/x86/intrapred_sse4.cc @@ -90,11 +90,11 @@ struct DirectionalPredFuncs_SSE4_1 { template <int width_log2, int height_log2, DcSumFunc top_sumfn, DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult> -void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn, - shiftk, dc_mult>::DcTop(void* const dest, - ptrdiff_t stride, - const void* const top_row, - const void* /*left_column*/) { +void DcPredFuncs_SSE4_1< + width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk, + dc_mult>::DcTop(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* /*left_column*/) { const __m128i rounder = _mm_set1_epi32(1 << (width_log2 - 1)); const __m128i sum = top_sumfn(top_row); const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2); @@ -103,11 +103,11 @@ void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn, template <int width_log2, int height_log2, DcSumFunc top_sumfn, DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult> -void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn, - shiftk, - dc_mult>::DcLeft(void* const dest, ptrdiff_t stride, - const void* /*top_row*/, - const void* const left_column) { +void DcPredFuncs_SSE4_1< + width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk, + dc_mult>::DcLeft(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* /*top_row*/, + const void* LIBGAV1_RESTRICT const left_column) { const __m128i rounder = _mm_set1_epi32(1 << (height_log2 - 1)); 
const __m128i sum = left_sumfn(left_column); const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), height_log2); @@ -116,10 +116,11 @@ void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn, template <int width_log2, int height_log2, DcSumFunc top_sumfn, DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult> -void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn, - shiftk, dc_mult>::Dc(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void DcPredFuncs_SSE4_1< + width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk, + dc_mult>::Dc(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const __m128i rounder = _mm_set1_epi32((1 << (width_log2 - 1)) + (1 << (height_log2 - 1))); const __m128i sum_top = top_sumfn(top_row); @@ -141,8 +142,8 @@ void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn, template <ColumnStoreFunc col_storefn> void DirectionalPredFuncs_SSE4_1<col_storefn>::Horizontal( - void* const dest, ptrdiff_t stride, const void* /*top_row*/, - const void* const left_column) { + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* /*top_row*/, const void* LIBGAV1_RESTRICT const left_column) { col_storefn(dest, stride, left_column); } @@ -384,8 +385,9 @@ inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride, // ColStoreN<height> copies each of the |height| values in |column| across its // corresponding row in dest. template <WriteDuplicateFunc writefn> -inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const column) { +inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const column) { const __m128i col_data = Load4(column); const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data); const __m128i col_dup32 = _mm_unpacklo_epi16(col_dup16, col_dup16); @@ -393,8 +395,9 @@ inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride, } template <WriteDuplicateFunc writefn> -inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const column) { +inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const column) { const ptrdiff_t stride4 = stride << 2; const __m128i col_data = LoadLo8(column); const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data); @@ -407,8 +410,9 @@ inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride, } template <WriteDuplicateFunc writefn> -inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const column) { +inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const column) { const ptrdiff_t stride4 = stride << 2; const __m128i col_data = _mm_loadu_si128(static_cast<const __m128i*>(column)); const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data); @@ -428,8 +432,9 @@ inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride, } template <WriteDuplicateFunc writefn> -inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const column) { +inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const column) { const ptrdiff_t stride4 = stride << 2; auto* dst = static_cast<uint8_t*>(dest); for (int y = 0; y <
32; y += 16) { @@ -457,8 +462,9 @@ inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride, } template <WriteDuplicateFunc writefn> -inline void ColStore64_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const column) { +inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const column) { const ptrdiff_t stride4 = stride << 2; auto* dst = static_cast<uint8_t*>(dest); for (int y = 0; y < 64; y += 16) { @@ -574,7 +580,7 @@ struct DirDefs { }; template <int y_mask> -inline void WritePaethLine4(uint8_t* dst, const __m128i& top, +inline void WritePaethLine4(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top, const __m128i& left, const __m128i& top_lefts, const __m128i& top_dists, const __m128i& left_dists, const __m128i& top_left_diffs) { @@ -614,7 +620,7 @@ inline void WritePaethLine4(uint8_t* dst, const __m128i& top, // could pay off to accommodate top_left_dists for cmpgt, and repack into epi8 // for the blends. template <int y_mask> -inline void WritePaethLine8(uint8_t* dst, const __m128i& top, +inline void WritePaethLine8(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top, const __m128i& left, const __m128i& top_lefts, const __m128i& top_dists, const __m128i& left_dists, const __m128i& top_left_diffs) { @@ -658,7 +664,7 @@ inline void WritePaethLine8(uint8_t* dst, const __m128i& top, // |left_dists| is provided alongside its spread out version because it doesn't // change between calls and interacts with both kinds of packing. template <int y_mask> -inline void WritePaethLine16(uint8_t* dst, const __m128i& top, +inline void WritePaethLine16(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top, const __m128i& left, const __m128i& top_lefts, const __m128i& top_dists, const __m128i& left_dists, @@ -712,8 +718,9 @@ inline void WritePaethLine16(uint8_t* dst, const __m128i& top, _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), pred); } -void Paeth4x4_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, const void* const left_column) { +void Paeth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const __m128i left = _mm_cvtepu8_epi32(Load4(left_column)); const __m128i top = _mm_cvtepu8_epi32(Load4(top_row)); @@ -742,8 +749,9 @@ void Paeth4x4_SSE4_1(void* const dest, ptrdiff_t stride, top_left_diff); } -void Paeth4x8_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, const void* const left_column) { +void Paeth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const __m128i left = LoadLo8(left_column); const __m128i left_lo = _mm_cvtepu8_epi32(left); const __m128i left_hi = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4)); @@ -787,9 +795,9 @@ void Paeth4x8_SSE4_1(void* const dest, ptrdiff_t stride, top_left_diff); } -void Paeth4x16_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void Paeth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const __m128i left = LoadUnaligned16(left_column); const __m128i left_0 = _mm_cvtepu8_epi32(left); const __m128i left_1 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4)); @@ -862,8 +870,9 @@ void Paeth4x16_SSE4_1(void* const dest, ptrdiff_t stride, top_left_diff); } -void 
Paeth8x4_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, const void* const left_column) { +void Paeth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const __m128i left = _mm_cvtepu8_epi16(Load4(left_column)); const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row)); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); @@ -891,8 +900,9 @@ void Paeth8x4_SSE4_1(void* const dest, ptrdiff_t stride, top_left_diff); } -void Paeth8x8_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, const void* const left_column) { +void Paeth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column)); const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row)); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); @@ -932,9 +942,9 @@ void Paeth8x8_SSE4_1(void* const dest, ptrdiff_t stride, top_left_diff); } -void Paeth8x16_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void Paeth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const __m128i left = LoadUnaligned16(left_column); const __m128i left_lo = _mm_cvtepu8_epi16(left); const __m128i left_hi = _mm_cvtepu8_epi16(_mm_srli_si128(left, 8)); @@ -1001,18 +1011,18 @@ void Paeth8x16_SSE4_1(void* const dest, ptrdiff_t stride, left_dists, top_left_diff); } -void Paeth8x32_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void Paeth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* const dst = static_cast<uint8_t*>(dest); Paeth8x16_SSE4_1(dst, stride, top_row, left_column); Paeth8x16_SSE4_1(dst + (stride << 4), stride, top_row, left_ptr + 16); } -void Paeth16x4_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void Paeth16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const __m128i left = Load4(left_column); const __m128i top = LoadUnaligned16(top_row); const __m128i top_lo = _mm_cvtepu8_epi16(top); @@ -1057,7 +1067,7 @@ void Paeth16x4_SSE4_1(void* const dest, ptrdiff_t stride, // Inlined for calling with offsets in larger transform sizes, mainly to // preserve top_left. 
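Note: the hunks that follow convert the Paeth kernels, and the comment above explains why WritePaeth16x8 is inlined with offsets (so the single top_left sample from top_row[-1] can be reused across 16-wide tiles). For orientation, the selection rule that every WritePaethLine*/WritePaeth16x* variant vectorizes is the scalar Paeth predictor; the sketch below is my own reference illustration, not library code.

#include <cstdint>
#include <cstdlib>

// Scalar Paeth predictor: pick whichever of left, top, top_left is closest
// to the gradient estimate left + top - top_left. The SIMD code computes
// the same distances (left_dists, top_dists, top_left_diffs) in parallel.
inline uint8_t PaethScalar(uint8_t left, uint8_t top, uint8_t top_left) {
  const int base = left + top - top_left;
  const int left_dist = std::abs(base - left);      // == |top - top_left|
  const int top_dist = std::abs(base - top);        // == |left - top_left|
  const int top_left_dist = std::abs(base - top_left);
  if (left_dist <= top_dist && left_dist <= top_left_dist) return left;
  if (top_dist <= top_left_dist) return top;
  return top_left;
}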
-inline void WritePaeth16x8(void* const dest, ptrdiff_t stride, +inline void WritePaeth16x8(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const uint8_t top_left, const __m128i top, const __m128i left) { const __m128i top_lo = _mm_cvtepu8_epi16(top); @@ -1115,9 +1125,9 @@ inline void WritePaeth16x8(void* const dest, ptrdiff_t stride, top_left_diff_lo, top_left_diff_hi); } -void Paeth16x8_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void Paeth16x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const __m128i top = LoadUnaligned16(top_row); const __m128i left = LoadLo8(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); @@ -1213,18 +1223,18 @@ void WritePaeth16x16(void* const dest, ptrdiff_t stride, const uint8_t top_left, top_left_diff_lo, top_left_diff_hi); } -void Paeth16x16_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void Paeth16x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const __m128i left = LoadUnaligned16(left_column); const __m128i top = LoadUnaligned16(top_row); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); WritePaeth16x16(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left); } -void Paeth16x32_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void Paeth16x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const __m128i left_0 = LoadUnaligned16(left_column); const __m128i top = LoadUnaligned16(top_row); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); @@ -1236,9 +1246,9 @@ void Paeth16x32_SSE4_1(void* const dest, ptrdiff_t stride, WritePaeth16x16(dst + (stride << 4), stride, top_left, top, left_1); } -void Paeth16x64_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void Paeth16x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const ptrdiff_t stride16 = stride << 4; const __m128i left_0 = LoadUnaligned16(left_column); const __m128i top = LoadUnaligned16(top_row); @@ -1258,9 +1268,9 @@ void Paeth16x64_SSE4_1(void* const dest, ptrdiff_t stride, WritePaeth16x16(dst, stride, top_left, top, left_3); } -void Paeth32x8_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void Paeth32x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const __m128i left = LoadLo8(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); const __m128i top_0 = LoadUnaligned16(top_row); @@ -1271,9 +1281,9 @@ void Paeth32x8_SSE4_1(void* const dest, ptrdiff_t stride, WritePaeth16x8(dst + 16, stride, top_left, top_1, left); } -void Paeth32x16_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void Paeth32x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + 
const void* LIBGAV1_RESTRICT const left_column) { const __m128i left = LoadUnaligned16(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); const __m128i top_0 = LoadUnaligned16(top_row); @@ -1284,9 +1294,9 @@ void Paeth32x16_SSE4_1(void* const dest, ptrdiff_t stride, WritePaeth16x16(dst + 16, stride, top_left, top_1, left); } -void Paeth32x32_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void Paeth32x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const __m128i left_0 = LoadUnaligned16(left_ptr); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); @@ -1302,9 +1312,9 @@ void Paeth32x32_SSE4_1(void* const dest, ptrdiff_t stride, WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1); } -void Paeth32x64_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void Paeth32x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const __m128i left_0 = LoadUnaligned16(left_ptr); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); @@ -1328,9 +1338,9 @@ void Paeth32x64_SSE4_1(void* const dest, ptrdiff_t stride, WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3); } -void Paeth64x16_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void Paeth64x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const __m128i left = LoadUnaligned16(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); const __m128i top_0 = LoadUnaligned16(top_ptr); @@ -1345,9 +1355,9 @@ void Paeth64x16_SSE4_1(void* const dest, ptrdiff_t stride, WritePaeth16x16(dst + 48, stride, top_left, top_3, left); } -void Paeth64x32_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void Paeth64x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const __m128i left_0 = LoadUnaligned16(left_ptr); const __m128i left_1 = LoadUnaligned16(left_ptr + 16); @@ -1369,9 +1379,9 @@ void Paeth64x32_SSE4_1(void* const dest, ptrdiff_t stride, WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1); } -void Paeth64x64_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void Paeth64x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const __m128i left_0 = LoadUnaligned16(left_ptr); const __m128i left_1 = LoadUnaligned16(left_ptr + 16); @@ -1793,7 +1803,6 @@ void Init8bpp() { DirDefs::_64x64::Horizontal; #endif } // NOLINT(readability/fn_size) -// TODO(petersonab): Split Init8bpp function into family-specific files. 
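Note: the one change threaded through every hunk of this diff is the LIBGAV1_RESTRICT qualifier on pointer parameters. Upstream appears to define it in src/utils/compiler_attributes.h as the compiler's nonstandard restrict extension where one exists (hedged; the sketch below reconstructs the idea, and AverageRows is a hypothetical helper, not library code). The qualifier promises the optimizer that the annotated pointers do not alias, so loads in these SIMD loops can stay in registers instead of being reloaded after every store.

#include <cstdint>

#if defined(_MSC_VER)
#define LIBGAV1_RESTRICT __restrict
#elif defined(__GNUC__)
#define LIBGAV1_RESTRICT __restrict__
#else
#define LIBGAV1_RESTRICT
#endif

// Without the qualifiers the compiler must assume dest may alias the
// prediction buffers; with them the adds below vectorize freely.
void AverageRows(const int16_t* LIBGAV1_RESTRICT prediction_0,
                 const int16_t* LIBGAV1_RESTRICT prediction_1, int width,
                 uint8_t* LIBGAV1_RESTRICT dest) {
  for (int x = 0; x < width; ++x) {
    // Inputs are assumed to already fit in 8 bits for this sketch.
    dest[x] =
        static_cast<uint8_t>((prediction_0[x] + prediction_1[x] + 1) >> 1);
  }
}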
} // namespace } // namespace low_bitdepth @@ -1937,16 +1946,18 @@ inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride, // ColStoreN<height> copies each of the |height| values in |column| across its // corresponding row in dest. template <WriteDuplicateFunc writefn> -inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const column) { +inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const column) { const __m128i col_data = LoadLo8(column); const __m128i col_dup32 = _mm_unpacklo_epi16(col_data, col_data); writefn(dest, stride, col_dup32); } template <WriteDuplicateFunc writefn> -inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const column) { +inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const column) { const __m128i col_data = LoadUnaligned16(column); const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data); const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data); @@ -1958,8 +1969,9 @@ inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride, } template <WriteDuplicateFunc writefn> -inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const column) { +inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const column) { const ptrdiff_t stride4 = stride << 2; auto* dst = static_cast<uint8_t*>(dest); for (int y = 0; y < 32; y += 16) { @@ -1975,8 +1987,9 @@ inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride, } template <WriteDuplicateFunc writefn> -inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const column) { +inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const column) { const ptrdiff_t stride4 = stride << 2; auto* dst = static_cast<uint8_t*>(dest); for (int y = 0; y < 64; y += 16) { @@ -1992,8 +2005,9 @@ inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride, } template <WriteDuplicateFunc writefn> -inline void ColStore64_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const column) { +inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const column) { const ptrdiff_t stride4 = stride << 2; auto* dst = static_cast<uint8_t*>(dest); for (int y = 0; y < 128; y += 16) { diff --git a/src/dsp/x86/inverse_transform_sse4.cc b/src/dsp/x86/inverse_transform_sse4.cc index 12c008f..e9ceb87 100644 --- a/src/dsp/x86/inverse_transform_sse4.cc +++ b/src/dsp/x86/inverse_transform_sse4.cc @@ -41,7 +41,8 @@ namespace { #include "src/dsp/inverse_transform.inc" template <int store_width, int store_count> -LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* dst, int32_t stride, int32_t idx, +LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* LIBGAV1_RESTRICT dst, + int32_t stride, int32_t idx, const __m128i* s) { // NOTE: It is expected that the compiler will unroll these loops. 
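Note: the "expected that the compiler will unroll these loops" remark above works because store_width and store_count are template parameters of StoreDst/LoadSrc, so every trip count is a compile-time constant. A hypothetical reduction of that pattern (StoreRows is my own name, not the library's):

#include <emmintrin.h>  // SSE2: __m128i, _mm_storeu_si128
#include <cstdint>

// Because count is a template parameter, the loop bound is a constant
// expression and compilers typically unroll it into straight-line stores.
// stride is in int16_t units, matching the &dst[i * stride] addressing used
// by StoreDst above.
template <int count>
inline void StoreRows(int16_t* dst, int32_t stride, const __m128i* s) {
  for (int i = 0; i < count; ++i) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[i * stride]), s[i]);
  }
}
// StoreRows<4>(dst, stride, s) emits four stores and no loop control.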
if (store_width == 16) { @@ -63,8 +64,8 @@ LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* dst, int32_t stride, int32_t idx, } template <int load_width, int load_count> -LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* src, int32_t stride, - int32_t idx, __m128i* x) { +LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* LIBGAV1_RESTRICT src, + int32_t stride, int32_t idx, __m128i* x) { // NOTE: It is expected that the compiler will unroll these loops. if (load_width == 16) { for (int i = 0; i < load_count; i += 4) { @@ -1638,9 +1639,10 @@ LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height, LIBGAV1_ALWAYS_INLINE void Identity4ColumnStoreToFrame( Array2DView<uint8_t> frame, const int start_x, const int start_y, - const int tx_width, const int tx_height, const int16_t* source) { + const int tx_width, const int tx_height, + const int16_t* LIBGAV1_RESTRICT source) { const int stride = frame.columns(); - uint8_t* dst = frame[start_y] + start_x; + uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x; const __m128i v_multiplier_fraction = _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3)); @@ -1685,9 +1687,10 @@ LIBGAV1_ALWAYS_INLINE void Identity4ColumnStoreToFrame( LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame( Array2DView<uint8_t> frame, const int start_x, const int start_y, - const int tx_width, const int tx_height, const int16_t* source) { + const int tx_width, const int tx_height, + const int16_t* LIBGAV1_RESTRICT source) { const int stride = frame.columns(); - uint8_t* dst = frame[start_y] + start_x; + uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x; const __m128i v_multiplier_fraction = _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3)); @@ -1789,9 +1792,10 @@ LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height, LIBGAV1_ALWAYS_INLINE void Identity8ColumnStoreToFrame_SSE4_1( Array2DView<uint8_t> frame, const int start_x, const int start_y, - const int tx_width, const int tx_height, const int16_t* source) { + const int tx_width, const int tx_height, + const int16_t* LIBGAV1_RESTRICT source) { const int stride = frame.columns(); - uint8_t* dst = frame[start_y] + start_x; + uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x; const __m128i v_eight = _mm_set1_epi16(8); if (tx_width == 4) { int i = 0; @@ -1883,9 +1887,10 @@ LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height, LIBGAV1_ALWAYS_INLINE void Identity16ColumnStoreToFrame_SSE4_1( Array2DView<uint8_t> frame, const int start_x, const int start_y, - const int tx_width, const int tx_height, const int16_t* source) { + const int tx_width, const int tx_height, + const int16_t* LIBGAV1_RESTRICT source) { const int stride = frame.columns(); - uint8_t* dst = frame[start_y] + start_x; + uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x; const __m128i v_eight = _mm_set1_epi16(8); const __m128i v_multiplier = _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 4)); @@ -1966,9 +1971,10 @@ LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest, LIBGAV1_ALWAYS_INLINE void Identity32ColumnStoreToFrame( Array2DView<uint8_t> frame, const int start_x, const int start_y, - const int tx_width, const int tx_height, const int16_t* source) { + const int tx_width, const int tx_height, + const int16_t* LIBGAV1_RESTRICT source) { const int stride = frame.columns(); - uint8_t* dst = frame[start_y] + start_x; + uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x; const __m128i v_two = 
_mm_set1_epi16(2); int i = 0; @@ -1995,7 +2001,7 @@ LIBGAV1_ALWAYS_INLINE void Identity32ColumnStoreToFrame( // Process 4 wht4 rows and columns. LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView<uint8_t> frame, const int start_x, const int start_y, - const void* source, + const void* LIBGAV1_RESTRICT source, const int adjusted_tx_height) { const auto* const src = static_cast<const int16_t*>(source); __m128i s[4], x[4]; @@ -2058,12 +2064,11 @@ LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView<uint8_t> frame, // Store to frame. const int stride = frame.columns(); - uint8_t* dst = frame[start_y] + start_x; + uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x; for (int row = 0; row < 4; ++row) { const __m128i frame_data = Load4(dst); const __m128i a = _mm_cvtepu8_epi16(frame_data); - // Saturate to prevent overflowing int16_t - const __m128i b = _mm_adds_epi16(a, s[row]); + const __m128i b = _mm_add_epi16(a, s[row]); Store4(dst, _mm_packus_epi16(b, b)); dst += stride; } @@ -2075,13 +2080,13 @@ LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView<uint8_t> frame, template <bool enable_flip_rows = false> LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound( Array2DView<uint8_t> frame, const int start_x, const int start_y, - const int tx_width, const int tx_height, const int16_t* source, - TransformType tx_type) { + const int tx_width, const int tx_height, + const int16_t* LIBGAV1_RESTRICT source, TransformType tx_type) { const bool flip_rows = enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false; const __m128i v_eight = _mm_set1_epi16(8); const int stride = frame.columns(); - uint8_t* dst = frame[start_y] + start_x; + uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x; if (tx_width == 4) { for (int i = 0; i < tx_height; ++i) { const int row = flip_rows ? 
(tx_height - i - 1) * 4 : i * 4; @@ -2262,8 +2267,10 @@ void Dct4TransformLoopRow_SSE4_1(TransformType /*tx_type*/, void Dct4TransformLoopColumn_SSE4_1(TransformType tx_type, TransformSize tx_size, - int adjusted_tx_height, void* src_buffer, - int start_x, int start_y, void* dst_frame) { + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { auto* src = static_cast<int16_t*>(src_buffer); const int tx_width = kTransformWidth[tx_size]; @@ -2325,8 +2332,10 @@ void Dct8TransformLoopRow_SSE4_1(TransformType /*tx_type*/, void Dct8TransformLoopColumn_SSE4_1(TransformType tx_type, TransformSize tx_size, - int adjusted_tx_height, void* src_buffer, - int start_x, int start_y, void* dst_frame) { + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { auto* src = static_cast<int16_t*>(src_buffer); const int tx_width = kTransformWidth[tx_size]; @@ -2386,9 +2395,10 @@ void Dct16TransformLoopRow_SSE4_1(TransformType /*tx_type*/, void Dct16TransformLoopColumn_SSE4_1(TransformType tx_type, TransformSize tx_size, - int adjusted_tx_height, void* src_buffer, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, int start_x, int start_y, - void* dst_frame) { + void* LIBGAV1_RESTRICT dst_frame) { auto* src = static_cast<int16_t*>(src_buffer); const int tx_width = kTransformWidth[tx_size]; @@ -2441,9 +2451,10 @@ void Dct32TransformLoopRow_SSE4_1(TransformType /*tx_type*/, void Dct32TransformLoopColumn_SSE4_1(TransformType tx_type, TransformSize tx_size, - int adjusted_tx_height, void* src_buffer, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, int start_x, int start_y, - void* dst_frame) { + void* LIBGAV1_RESTRICT dst_frame) { auto* src = static_cast<int16_t*>(src_buffer); const int tx_width = kTransformWidth[tx_size]; @@ -2486,9 +2497,10 @@ void Dct64TransformLoopRow_SSE4_1(TransformType /*tx_type*/, void Dct64TransformLoopColumn_SSE4_1(TransformType tx_type, TransformSize tx_size, - int adjusted_tx_height, void* src_buffer, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, int start_x, int start_y, - void* dst_frame) { + void* LIBGAV1_RESTRICT dst_frame) { auto* src = static_cast<int16_t*>(src_buffer); const int tx_width = kTransformWidth[tx_size]; @@ -2535,9 +2547,10 @@ void Adst4TransformLoopRow_SSE4_1(TransformType /*tx_type*/, void Adst4TransformLoopColumn_SSE4_1(TransformType tx_type, TransformSize tx_size, - int adjusted_tx_height, void* src_buffer, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, int start_x, int start_y, - void* dst_frame) { + void* LIBGAV1_RESTRICT dst_frame) { auto* src = static_cast<int16_t*>(src_buffer); const int tx_width = kTransformWidth[tx_size]; @@ -2594,9 +2607,10 @@ void Adst8TransformLoopRow_SSE4_1(TransformType /*tx_type*/, void Adst8TransformLoopColumn_SSE4_1(TransformType tx_type, TransformSize tx_size, - int adjusted_tx_height, void* src_buffer, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, int start_x, int start_y, - void* dst_frame) { + void* LIBGAV1_RESTRICT dst_frame) { auto* src = static_cast<int16_t*>(src_buffer); const int tx_width = kTransformWidth[tx_size]; @@ -2658,9 +2672,10 @@ void Adst16TransformLoopRow_SSE4_1(TransformType /*tx_type*/, void Adst16TransformLoopColumn_SSE4_1(TransformType tx_type, TransformSize tx_size, - int adjusted_tx_height, void* src_buffer, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, int start_x, int 
start_y, - void* dst_frame) { + void* LIBGAV1_RESTRICT dst_frame) { auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame); auto* src = static_cast<int16_t*>(src_buffer); const int tx_width = kTransformWidth[tx_size]; @@ -2727,8 +2742,9 @@ void Identity4TransformLoopRow_SSE4_1(TransformType tx_type, void Identity4TransformLoopColumn_SSE4_1(TransformType tx_type, TransformSize tx_size, int adjusted_tx_height, - void* src_buffer, int start_x, - int start_y, void* dst_frame) { + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame); auto* src = static_cast<int16_t*>(src_buffer); const int tx_width = kTransformWidth[tx_size]; @@ -2799,8 +2815,9 @@ void Identity8TransformLoopRow_SSE4_1(TransformType tx_type, void Identity8TransformLoopColumn_SSE4_1(TransformType tx_type, TransformSize tx_size, int adjusted_tx_height, - void* src_buffer, int start_x, - int start_y, void* dst_frame) { + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { auto* src = static_cast<int16_t*>(src_buffer); const int tx_width = kTransformWidth[tx_size]; @@ -2839,8 +2856,9 @@ void Identity16TransformLoopRow_SSE4_1(TransformType /*tx_type*/, void Identity16TransformLoopColumn_SSE4_1(TransformType tx_type, TransformSize tx_size, int adjusted_tx_height, - void* src_buffer, int start_x, - int start_y, void* dst_frame) { + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { auto* src = static_cast<int16_t*>(src_buffer); const int tx_width = kTransformWidth[tx_size]; @@ -2884,8 +2902,9 @@ void Identity32TransformLoopRow_SSE4_1(TransformType /*tx_type*/, void Identity32TransformLoopColumn_SSE4_1(TransformType /*tx_type*/, TransformSize tx_size, int adjusted_tx_height, - void* src_buffer, int start_x, - int start_y, void* dst_frame) { + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame); auto* src = static_cast<int16_t*>(src_buffer); const int tx_width = kTransformWidth[tx_size]; @@ -2907,8 +2926,10 @@ void Wht4TransformLoopRow_SSE4_1(TransformType tx_type, TransformSize tx_size, void Wht4TransformLoopColumn_SSE4_1(TransformType tx_type, TransformSize tx_size, - int adjusted_tx_height, void* src_buffer, - int start_x, int start_y, void* dst_frame) { + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { assert(tx_type == kTransformTypeDctDct); assert(tx_size == kTransformSize4x4); static_cast<void>(tx_type); @@ -2928,88 +2949,88 @@ void Init8bpp() { assert(dsp != nullptr); // Maximum transform size for Dct is 64. 
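Note: the Init8bpp registrations below fill a three-dimensional function-pointer table indexed by 1-D transform type, 1-D size, and row/column pass; the renames in these hunks change only the enumerator spellings (k1DTransformDct to kTransform1dDct, k1DTransformSize4 to kTransform1dSize4, and so on), not the layout. The mock below shows the shape; the bounds, the kRow/kColumn pair, and the function signature are simplified stand-ins for the real dsp.h declarations.

enum Transform1d {
  kTransform1dDct, kTransform1dAdst, kTransform1dIdentity, kTransform1dWht,
  kNumTransform1ds
};
enum Transform1dSize {
  kTransform1dSize4, kTransform1dSize8, kTransform1dSize16,
  kTransform1dSize32, kTransform1dSize64, kNumTransform1dSizes
};
enum { kRow = 0, kColumn = 1 };

using TransformLoopFunc = void (*)(void* src_buffer, void* dst_frame);
TransformLoopFunc inverse_transforms[kNumTransform1ds][kNumTransform1dSizes][2];

// After initialization the decoder picks a kernel with plain indexing, e.g.
//   inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow](src, frame);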
-#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformDct) - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] = +#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dDct) + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] = Dct4TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] = + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] = Dct4TransformLoopColumn_SSE4_1; #endif -#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformDct) - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] = +#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize8_Transform1dDct) + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] = Dct8TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] = + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] = Dct8TransformLoopColumn_SSE4_1; #endif -#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformDct) - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] = +#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize16_Transform1dDct) + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] = Dct16TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] = + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] = Dct16TransformLoopColumn_SSE4_1; #endif -#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformDct) - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] = +#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize32_Transform1dDct) + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] = Dct32TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] = + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] = Dct32TransformLoopColumn_SSE4_1; #endif -#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize64_1DTransformDct) - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] = +#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize64_Transform1dDct) + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] = Dct64TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] = + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] = Dct64TransformLoopColumn_SSE4_1; #endif // Maximum transform size for Adst is 16. 
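Note: each DSP_ENABLED_8BPP_SSE4_1(...) guard here pairs with an #ifndef/#define block in the header further down. If no higher-priority implementation (for example AVX2) has already claimed a macro such as LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct, the header defines it to LIBGAV1_CPU_SSE4_1 and the matching registration compiles in. The skeleton below is my reconstruction of that pattern; the macro value and the expansion of DSP_ENABLED_8BPP_SSE4_1 are educated guesses, not quoted from the library.

// Header side: claim the slot for SSE4.1 unless something already did.
#define LIBGAV1_CPU_SSE4_1 (1 << 0)  // placeholder value for this sketch

#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct
#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct LIBGAV1_CPU_SSE4_1
#endif

// Source side: DSP_ENABLED_8BPP_SSE4_1(X) appears to expand to a test of
// the form LIBGAV1_Dsp8bpp_##X == LIBGAV1_CPU_SSE4_1, so the assignment
// below compiles only when SSE4.1 owns the slot.
#if LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct == LIBGAV1_CPU_SSE4_1
// dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] = ...;
#endif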
-#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformAdst) - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] = +#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dAdst) + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] = Adst4TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] = + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] = Adst4TransformLoopColumn_SSE4_1; #endif -#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformAdst) - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] = +#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize8_Transform1dAdst) + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] = Adst8TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] = + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] = Adst8TransformLoopColumn_SSE4_1; #endif -#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformAdst) - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] = +#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize16_Transform1dAdst) + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] = Adst16TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] = + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] = Adst16TransformLoopColumn_SSE4_1; #endif // Maximum transform size for Identity transform is 32. -#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformIdentity) - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] = +#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dIdentity) + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] = Identity4TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] = + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] = Identity4TransformLoopColumn_SSE4_1; #endif -#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformIdentity) - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] = +#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize8_Transform1dIdentity) + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] = Identity8TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] = + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] = Identity8TransformLoopColumn_SSE4_1; #endif -#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformIdentity) - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] = +#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize16_Transform1dIdentity) + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] = Identity16TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] = + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] = Identity16TransformLoopColumn_SSE4_1; #endif -#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformIdentity) - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] = +#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize32_Transform1dIdentity) + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] = Identity32TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] = + 
dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] = Identity32TransformLoopColumn_SSE4_1; #endif // Maximum transform size for Wht is 4. -#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformWht) - dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] = +#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dWht) + dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] = Wht4TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] = + dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] = Wht4TransformLoopColumn_SSE4_1; #endif } diff --git a/src/dsp/x86/inverse_transform_sse4.h b/src/dsp/x86/inverse_transform_sse4.h index 106084b..c31e88b 100644 --- a/src/dsp/x86/inverse_transform_sse4.h +++ b/src/dsp/x86/inverse_transform_sse4.h @@ -34,56 +34,56 @@ void InverseTransformInit_SSE4_1(); // optimization being enabled, signal the sse4 implementation should be used. #if LIBGAV1_TARGETING_SSE4_1 -#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct -#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct +#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct LIBGAV1_CPU_SSE4_1 #endif -#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct -#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct LIBGAV1_CPU_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dDct +#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dDct LIBGAV1_CPU_SSE4_1 #endif -#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct -#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct LIBGAV1_CPU_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dDct +#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dDct LIBGAV1_CPU_SSE4_1 #endif -#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct -#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct LIBGAV1_CPU_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dDct +#define LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dDct LIBGAV1_CPU_SSE4_1 #endif -#ifndef LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct -#define LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct LIBGAV1_CPU_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize64_Transform1dDct +#define LIBGAV1_Dsp8bpp_Transform1dSize64_Transform1dDct LIBGAV1_CPU_SSE4_1 #endif -#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst -#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst LIBGAV1_CPU_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dAdst +#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dAdst LIBGAV1_CPU_SSE4_1 #endif -#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst -#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst LIBGAV1_CPU_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dAdst +#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dAdst LIBGAV1_CPU_SSE4_1 #endif -#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst -#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst LIBGAV1_CPU_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dAdst +#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dAdst LIBGAV1_CPU_SSE4_1 #endif -#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity -#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity LIBGAV1_CPU_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dIdentity +#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dIdentity LIBGAV1_CPU_SSE4_1 #endif -#ifndef 
LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity -#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity LIBGAV1_CPU_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dIdentity +#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dIdentity LIBGAV1_CPU_SSE4_1 #endif -#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity -#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity LIBGAV1_CPU_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dIdentity +#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dIdentity LIBGAV1_CPU_SSE4_1 #endif -#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity -#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity LIBGAV1_CPU_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dIdentity +#define LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dIdentity LIBGAV1_CPU_SSE4_1 #endif -#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht -#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dWht +#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dWht LIBGAV1_CPU_SSE4_1 #endif #endif // LIBGAV1_TARGETING_SSE4_1 #endif // LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_ diff --git a/src/dsp/x86/loop_restoration_10bit_avx2.cc b/src/dsp/x86/loop_restoration_10bit_avx2.cc index b38f322..daf5c42 100644 --- a/src/dsp/x86/loop_restoration_10bit_avx2.cc +++ b/src/dsp/x86/loop_restoration_10bit_avx2.cc @@ -472,11 +472,14 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer, } void WienerFilter_AVX2( - const RestorationUnitInfo& restoration_info, const void* const source, - const ptrdiff_t stride, const void* const top_border, - const ptrdiff_t top_border_stride, const void* const bottom_border, + const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info, + const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_border, + const ptrdiff_t top_border_stride, + const void* LIBGAV1_RESTRICT const bottom_border, const ptrdiff_t bottom_border_stride, const int width, const int height, - RestorationBuffer* const restoration_buffer, void* const dest) { + RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer, + void* LIBGAV1_RESTRICT const dest) { const int16_t* const number_leading_zero_coefficients = restoration_info.wiener_info.number_leading_zero_coefficients; const int number_rows_to_skip = std::max( @@ -3097,11 +3100,14 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, // in the end of each row. It is safe to overwrite the output as it will not be // part of the visible frame. 
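Note: the four WienerFilter/SelfGuidedFilter hunks in this stretch (10-bit and 8-bit, AVX2 and SSE4.1) apply the same LIBGAV1_RESTRICT treatment to the same signature. For the self-guided path, the "2 or 0" / "1 or 0" comments visible in each hunk encode the pass selection: kSgrProjParams[index] supplies two radii, and a zero radius disables that box-filter pass. A sketch of the dispatch, under the assumption (consistent with the hunks, not quoted from them) that the BoxFilterProcess* helpers split along exactly these cases:

// Sketch, not library code: eps columns of kSgrProjParams omitted.
void SelectSgrPasses(int radius_pass_0 /* 2 or 0 */,
                     int radius_pass_1 /* 1 or 0 */) {
  if (radius_pass_0 != 0 && radius_pass_1 != 0) {
    // Both passes run and their outputs are blended (BoxFilterProcess).
  } else if (radius_pass_0 != 0) {
    // Only the radius-2 pass runs (BoxFilterProcessPass1).
  } else {
    // Only the radius-1 pass runs (BoxFilterProcessPass2).
  }
}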
void SelfGuidedFilter_AVX2( - const RestorationUnitInfo& restoration_info, const void* const source, - const ptrdiff_t stride, const void* const top_border, - const ptrdiff_t top_border_stride, const void* const bottom_border, + const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info, + const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_border, + const ptrdiff_t top_border_stride, + const void* LIBGAV1_RESTRICT const bottom_border, const ptrdiff_t bottom_border_stride, const int width, const int height, - RestorationBuffer* const restoration_buffer, void* const dest) { + RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer, + void* LIBGAV1_RESTRICT const dest) { const int index = restoration_info.sgr_proj_info.index; const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0 diff --git a/src/dsp/x86/loop_restoration_10bit_sse4.cc b/src/dsp/x86/loop_restoration_10bit_sse4.cc index 96380e3..6625d51 100644 --- a/src/dsp/x86/loop_restoration_10bit_sse4.cc +++ b/src/dsp/x86/loop_restoration_10bit_sse4.cc @@ -429,11 +429,14 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer, } void WienerFilter_SSE4_1( - const RestorationUnitInfo& restoration_info, const void* const source, - const ptrdiff_t stride, const void* const top_border, - const ptrdiff_t top_border_stride, const void* const bottom_border, + const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info, + const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_border, + const ptrdiff_t top_border_stride, + const void* LIBGAV1_RESTRICT const bottom_border, const ptrdiff_t bottom_border_stride, const int width, const int height, - RestorationBuffer* const restoration_buffer, void* const dest) { + RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer, + void* LIBGAV1_RESTRICT const dest) { const int16_t* const number_leading_zero_coefficients = restoration_info.wiener_info.number_leading_zero_coefficients; const int number_rows_to_skip = std::max( @@ -2465,11 +2468,14 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, // in the end of each row. It is safe to overwrite the output as it will not be // part of the visible frame. 
void SelfGuidedFilter_SSE4_1( - const RestorationUnitInfo& restoration_info, const void* const source, - const ptrdiff_t stride, const void* const top_border, - const ptrdiff_t top_border_stride, const void* const bottom_border, + const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info, + const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_border, + const ptrdiff_t top_border_stride, + const void* LIBGAV1_RESTRICT const bottom_border, const ptrdiff_t bottom_border_stride, const int width, const int height, - RestorationBuffer* const restoration_buffer, void* const dest) { + RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer, + void* LIBGAV1_RESTRICT const dest) { const int index = restoration_info.sgr_proj_info.index; const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0 diff --git a/src/dsp/x86/loop_restoration_avx2.cc b/src/dsp/x86/loop_restoration_avx2.cc index 351a324..30e8a22 100644 --- a/src/dsp/x86/loop_restoration_avx2.cc +++ b/src/dsp/x86/loop_restoration_avx2.cc @@ -483,11 +483,14 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer, } void WienerFilter_AVX2( - const RestorationUnitInfo& restoration_info, const void* const source, - const ptrdiff_t stride, const void* const top_border, - const ptrdiff_t top_border_stride, const void* const bottom_border, + const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info, + const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_border, + const ptrdiff_t top_border_stride, + const void* LIBGAV1_RESTRICT const bottom_border, const ptrdiff_t bottom_border_stride, const int width, const int height, - RestorationBuffer* const restoration_buffer, void* const dest) { + RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer, + void* LIBGAV1_RESTRICT const dest) { const int16_t* const number_leading_zero_coefficients = restoration_info.wiener_info.number_leading_zero_coefficients; const int number_rows_to_skip = std::max( @@ -2880,11 +2883,14 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, // in the end of each row. It is safe to overwrite the output as it will not be // part of the visible frame. 
void SelfGuidedFilter_AVX2( - const RestorationUnitInfo& restoration_info, const void* const source, - const ptrdiff_t stride, const void* const top_border, - const ptrdiff_t top_border_stride, const void* const bottom_border, + const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info, + const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_border, + const ptrdiff_t top_border_stride, + const void* LIBGAV1_RESTRICT const bottom_border, const ptrdiff_t bottom_border_stride, const int width, const int height, - RestorationBuffer* const restoration_buffer, void* const dest) { + RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer, + void* LIBGAV1_RESTRICT const dest) { const int index = restoration_info.sgr_proj_info.index; const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0 diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc index 273bcc8..3363f0e 100644 --- a/src/dsp/x86/loop_restoration_sse4.cc +++ b/src/dsp/x86/loop_restoration_sse4.cc @@ -482,11 +482,14 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer, } void WienerFilter_SSE4_1( - const RestorationUnitInfo& restoration_info, const void* const source, - const ptrdiff_t stride, const void* const top_border, - const ptrdiff_t top_border_stride, const void* const bottom_border, + const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info, + const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_border, + const ptrdiff_t top_border_stride, + const void* LIBGAV1_RESTRICT const bottom_border, const ptrdiff_t bottom_border_stride, const int width, const int height, - RestorationBuffer* const restoration_buffer, void* const dest) { + RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer, + void* LIBGAV1_RESTRICT const dest) { const int16_t* const number_leading_zero_coefficients = restoration_info.wiener_info.number_leading_zero_coefficients; const int number_rows_to_skip = std::max( @@ -2510,11 +2513,14 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, // in the end of each row. It is safe to overwrite the output as it will not be // part of the visible frame. 
void SelfGuidedFilter_SSE4_1( - const RestorationUnitInfo& restoration_info, const void* const source, - const ptrdiff_t stride, const void* const top_border, - const ptrdiff_t top_border_stride, const void* const bottom_border, + const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info, + const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_border, + const ptrdiff_t top_border_stride, + const void* LIBGAV1_RESTRICT const bottom_border, const ptrdiff_t bottom_border_stride, const int width, const int height, - RestorationBuffer* const restoration_buffer, void* const dest) { + RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer, + void* LIBGAV1_RESTRICT const dest) { const int index = restoration_info.sgr_proj_info.index; const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0 diff --git a/src/dsp/x86/mask_blend_sse4.cc b/src/dsp/x86/mask_blend_sse4.cc index 2e836af..a18444b 100644 --- a/src/dsp/x86/mask_blend_sse4.cc +++ b/src/dsp/x86/mask_blend_sse4.cc @@ -36,7 +36,8 @@ namespace { // Width can only be 4 when it is subsampled from a block of width 8, hence // subsampling_x is always 1 when this function is called. template <int subsampling_x, int subsampling_y> -inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) { +inline __m128i GetMask4x2(const uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { if (subsampling_x == 1) { const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask)); const __m128i mask_val_1 = @@ -62,7 +63,8 @@ inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) { // 16-bit is also the lowest packing for hadd, but without subsampling there is // an unfortunate conversion required. template <int subsampling_x, int subsampling_y> -inline __m128i GetMask8(const uint8_t* mask, ptrdiff_t stride) { +inline __m128i GetMask8(const uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t stride) { if (subsampling_x == 1) { const __m128i row_vals = LoadUnaligned16(mask); @@ -89,7 +91,8 @@ inline __m128i GetMask8(const uint8_t* mask, ptrdiff_t stride) { // when is_inter_intra is true, the prediction values are brought to 8-bit // packing as well. 
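Note: all of the mask-blend kernels in this file implement the same 6-bit weighted average; the mask_inverter constant (64) seen throughout supplies the complementary weight, and in the inter-intra variants the same weights are applied to two 8-bit predictions directly. A scalar form of the blend these kernels vectorize (my own illustration, not library code):

#include <cstdint>

// 6-bit mask blend: mask lies in [0, 64], with rounding by 32 before the
// shift. The compound 16-bit path additionally subtracts a compound offset
// and re-rounds, which is omitted here for clarity.
inline uint8_t MaskBlendScalar(int pred_0, int pred_1, int mask) {
  return static_cast<uint8_t>(
      (mask * pred_0 + (64 - mask) * pred_1 + 32) >> 6);
}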
template <int subsampling_x, int subsampling_y> -inline __m128i GetInterIntraMask8(const uint8_t* mask, ptrdiff_t stride) { +inline __m128i GetInterIntraMask8(const uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t stride) { if (subsampling_x == 1) { const __m128i row_vals = LoadUnaligned16(mask); @@ -116,10 +119,11 @@ inline __m128i GetInterIntraMask8(const uint8_t* mask, ptrdiff_t stride) { return mask_val; } -inline void WriteMaskBlendLine4x2(const int16_t* const pred_0, - const int16_t* const pred_1, +inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0, + const int16_t* LIBGAV1_RESTRICT const pred_1, const __m128i pred_mask_0, - const __m128i pred_mask_1, uint8_t* dst, + const __m128i pred_mask_1, + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const __m128i pred_val_0 = LoadAligned16(pred_0); const __m128i pred_val_1 = LoadAligned16(pred_1); @@ -145,9 +149,11 @@ inline void WriteMaskBlendLine4x2(const int16_t* const pred_0, } template <int subsampling_x, int subsampling_y> -inline void MaskBlending4x4_SSE4(const int16_t* pred_0, const int16_t* pred_1, - const uint8_t* mask, - const ptrdiff_t mask_stride, uint8_t* dst, +inline void MaskBlending4x4_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, + const uint8_t* LIBGAV1_RESTRICT mask, + const ptrdiff_t mask_stride, + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const __m128i mask_inverter = _mm_set1_epi16(64); __m128i pred_mask_0 = @@ -167,10 +173,12 @@ inline void MaskBlending4x4_SSE4(const int16_t* pred_0, const int16_t* pred_1, } template <int subsampling_x, int subsampling_y> -inline void MaskBlending4xH_SSE4(const int16_t* pred_0, const int16_t* pred_1, - const uint8_t* const mask_ptr, +inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, const int height, - uint8_t* dst, const ptrdiff_t dst_stride) { + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { const uint8_t* mask = mask_ptr; if (height == 4) { MaskBlending4x4_SSE4<subsampling_x, subsampling_y>( @@ -222,11 +230,12 @@ inline void MaskBlending4xH_SSE4(const int16_t* pred_0, const int16_t* pred_1, } template <int subsampling_x, int subsampling_y> -inline void MaskBlend_SSE4(const void* prediction_0, const void* prediction_1, +inline void MaskBlend_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t /*prediction_stride_1*/, - const uint8_t* const mask_ptr, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, const int width, - const int height, void* dest, + const int height, void* LIBGAV1_RESTRICT dest, const ptrdiff_t dst_stride) { auto* dst = static_cast<uint8_t*>(dest); const auto* pred_0 = static_cast<const int16_t*>(prediction_0); @@ -277,11 +286,10 @@ inline void MaskBlend_SSE4(const void* prediction_0, const void* prediction_1, } while (++y < height); } -inline void InterIntraWriteMaskBlendLine8bpp4x2(const uint8_t* const pred_0, - uint8_t* const pred_1, - const ptrdiff_t pred_stride_1, - const __m128i pred_mask_0, - const __m128i pred_mask_1) { +inline void InterIntraWriteMaskBlendLine8bpp4x2( + const uint8_t* LIBGAV1_RESTRICT const pred_0, + uint8_t* LIBGAV1_RESTRICT const pred_1, const ptrdiff_t pred_stride_1, + const __m128i pred_mask_0, const __m128i pred_mask_1) { const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1); const 
__m128i pred_val_0 = LoadLo8(pred_0); @@ -301,11 +309,10 @@ inline void InterIntraWriteMaskBlendLine8bpp4x2(const uint8_t* const pred_0, } template <int subsampling_x, int subsampling_y> -inline void InterIntraMaskBlending8bpp4x4_SSE4(const uint8_t* pred_0, - uint8_t* pred_1, - const ptrdiff_t pred_stride_1, - const uint8_t* mask, - const ptrdiff_t mask_stride) { +inline void InterIntraMaskBlending8bpp4x4_SSE4( + const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1, + const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask, + const ptrdiff_t mask_stride) { const __m128i mask_inverter = _mm_set1_epi8(64); const __m128i pred_mask_u16_first = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); @@ -328,12 +335,11 @@ inline void InterIntraMaskBlending8bpp4x4_SSE4(const uint8_t* pred_0, } template <int subsampling_x, int subsampling_y> -inline void InterIntraMaskBlending8bpp4xH_SSE4(const uint8_t* pred_0, - uint8_t* pred_1, - const ptrdiff_t pred_stride_1, - const uint8_t* const mask_ptr, - const ptrdiff_t mask_stride, - const int height) { +inline void InterIntraMaskBlending8bpp4xH_SSE4( + const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1, + const ptrdiff_t pred_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const int height) { const uint8_t* mask = mask_ptr; if (height == 4) { InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>( @@ -358,12 +364,11 @@ inline void InterIntraMaskBlending8bpp4xH_SSE4(const uint8_t* pred_0, } template <int subsampling_x, int subsampling_y> -void InterIntraMaskBlend8bpp_SSE4(const uint8_t* prediction_0, - uint8_t* prediction_1, - const ptrdiff_t prediction_stride_1, - const uint8_t* const mask_ptr, - const ptrdiff_t mask_stride, const int width, - const int height) { +void InterIntraMaskBlend8bpp_SSE4( + const uint8_t* LIBGAV1_RESTRICT prediction_0, + uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const int width, const int height) { if (width == 4) { InterIntraMaskBlending8bpp4xH_SSE4<subsampling_x, subsampling_y>( prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride, @@ -503,10 +508,11 @@ inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride, } inline void WriteMaskBlendLine10bpp4x2_SSE4_1( - const uint16_t* pred_0, const uint16_t* pred_1, - const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0, - const __m128i& pred_mask_1, const __m128i& offset, const __m128i& max, - const __m128i& shift4, uint16_t* dst, const ptrdiff_t dst_stride) { + const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, + const __m128i& pred_mask_0, const __m128i& pred_mask_1, + const __m128i& offset, const __m128i& max, const __m128i& shift4, + uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const __m128i pred_val_0 = LoadUnaligned16(pred_0); const __m128i pred_val_1 = LoadHi8(LoadLo8(pred_1), pred_1 + pred_stride_1); @@ -544,11 +550,12 @@ inline void WriteMaskBlendLine10bpp4x2_SSE4_1( } template <int subsampling_x, int subsampling_y> -inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* pred_0, - const uint16_t* pred_1, +inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, - const uint8_t* mask, - const ptrdiff_t mask_stride, uint16_t* dst, + const uint8_t* 
LIBGAV1_RESTRICT mask, + const ptrdiff_t mask_stride, + uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); const __m128i zero = _mm_setzero_si128(); @@ -575,13 +582,12 @@ inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* pred_0, } template <int subsampling_x, int subsampling_y> -inline void MaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0, - const uint16_t* pred_1, - const ptrdiff_t pred_stride_1, - const uint8_t* const mask_ptr, - const ptrdiff_t mask_stride, - const int height, uint16_t* dst, - const ptrdiff_t dst_stride) { +inline void MaskBlend10bpp4xH_SSE4_1( + const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const int height, uint16_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { const uint8_t* mask = mask_ptr; if (height == 4) { MaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>( @@ -648,13 +654,13 @@ inline void MaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0, } template <int subsampling_x, int subsampling_y> -inline void MaskBlend10bpp_SSE4_1(const void* prediction_0, - const void* prediction_1, - const ptrdiff_t prediction_stride_1, - const uint8_t* const mask_ptr, - const ptrdiff_t mask_stride, const int width, - const int height, void* dest, - const ptrdiff_t dest_stride) { +inline void MaskBlend10bpp_SSE4_1( + const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + const ptrdiff_t prediction_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const int width, const int height, void* LIBGAV1_RESTRICT dest, + const ptrdiff_t dest_stride) { auto* dst = static_cast<uint16_t*>(dest); const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]); const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); @@ -725,10 +731,11 @@ inline void MaskBlend10bpp_SSE4_1(const void* prediction_0, } inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1( - const uint16_t* prediction_0, const uint16_t* prediction_1, + const uint16_t* LIBGAV1_RESTRICT prediction_0, + const uint16_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0, - const __m128i& pred_mask_1, const __m128i& shift6, uint16_t* dst, - const ptrdiff_t dst_stride) { + const __m128i& pred_mask_1, const __m128i& shift6, + uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const __m128i pred_val_0 = LoadUnaligned16(prediction_0); const __m128i pred_val_1 = LoadHi8(LoadLo8(prediction_1), prediction_1 + pred_stride_1); @@ -751,9 +758,10 @@ inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1( template <int subsampling_x, int subsampling_y> inline void InterIntraMaskBlend10bpp4x4_SSE4_1( - const uint16_t* pred_0, const uint16_t* pred_1, - const ptrdiff_t pred_stride_1, const uint8_t* mask, - const ptrdiff_t mask_stride, uint16_t* dst, const ptrdiff_t dst_stride) { + const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, + const uint8_t* LIBGAV1_RESTRICT mask, const ptrdiff_t mask_stride, + uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); const __m128i zero = _mm_setzero_si128(); @@ -777,13 +785,12 @@ inline void InterIntraMaskBlend10bpp4x4_SSE4_1( } template <int subsampling_x, int subsampling_y> 
-inline void InterIntraMaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0, - const uint16_t* pred_1, - const ptrdiff_t pred_stride_1, - const uint8_t* const mask_ptr, - const ptrdiff_t mask_stride, - const int height, uint16_t* dst, - const ptrdiff_t dst_stride) { +inline void InterIntraMaskBlend10bpp4xH_SSE4_1( + const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const int height, uint16_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { const uint8_t* mask = mask_ptr; if (height == 4) { InterIntraMaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>( @@ -848,9 +855,11 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0, template <int subsampling_x, int subsampling_y> inline void InterIntraMaskBlend10bpp_SSE4_1( - const void* prediction_0, const void* prediction_1, - const ptrdiff_t prediction_stride_1, const uint8_t* const mask_ptr, - const ptrdiff_t mask_stride, const int width, const int height, void* dest, + const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + const ptrdiff_t prediction_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const int width, const int height, void* LIBGAV1_RESTRICT dest, const ptrdiff_t dest_stride) { auto* dst = static_cast<uint16_t*>(dest); const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]); diff --git a/src/dsp/x86/motion_field_projection_sse4.cc b/src/dsp/x86/motion_field_projection_sse4.cc index e3f2cce..5641531 100644 --- a/src/dsp/x86/motion_field_projection_sse4.cc +++ b/src/dsp/x86/motion_field_projection_sse4.cc @@ -360,27 +360,12 @@ void MotionFieldProjectionKernel_SSE4_1( } while (++y8 < y8_end); } -void Init8bpp() { - Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); - assert(dsp != nullptr); - dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1; -} - -#if LIBGAV1_MAX_BITDEPTH >= 10 -void Init10bpp() { - Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); - assert(dsp != nullptr); - dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1; -} -#endif - } // namespace void MotionFieldProjectionInit_SSE4_1() { - Init8bpp(); -#if LIBGAV1_MAX_BITDEPTH >= 10 - Init10bpp(); -#endif + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1; } } // namespace dsp diff --git a/src/dsp/x86/motion_vector_search_sse4.cc b/src/dsp/x86/motion_vector_search_sse4.cc index 7f5f035..dacc6ec 100644 --- a/src/dsp/x86/motion_vector_search_sse4.cc +++ b/src/dsp/x86/motion_vector_search_sse4.cc @@ -64,7 +64,7 @@ inline __m128i MvProjectionClip(const __m128i mvs[2], } inline __m128i MvProjectionCompoundClip( - const MotionVector* const temporal_mvs, + const MotionVector* LIBGAV1_RESTRICT const temporal_mvs, const int8_t temporal_reference_offsets[2], const int reference_offsets[2]) { const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs); @@ -83,8 +83,8 @@ inline __m128i MvProjectionCompoundClip( } inline __m128i MvProjectionSingleClip( - const MotionVector* const temporal_mvs, - const int8_t* const temporal_reference_offsets, + const MotionVector* LIBGAV1_RESTRICT const temporal_mvs, + const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets, const int reference_offset) { const auto* const tmvs = reinterpret_cast<const 
int16_t*>(temporal_mvs); const __m128i temporal_mv = LoadAligned16(tmvs); @@ -126,9 +126,10 @@ inline void ForceInteger(const __m128i mv, void* const candidate_mvs) { } void MvProjectionCompoundLowPrecision_SSE4_1( - const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const MotionVector* LIBGAV1_RESTRICT temporal_mvs, + const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets, const int reference_offsets[2], const int count, - CompoundMotionVector* candidate_mvs) { + CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) { // |reference_offsets| non-zero check usually equals true and is ignored. // To facilitate the compilers, make a local copy of |reference_offsets|. const int offsets[2] = {reference_offsets[0], reference_offsets[1]}; @@ -143,9 +144,10 @@ void MvProjectionCompoundLowPrecision_SSE4_1( } void MvProjectionCompoundForceInteger_SSE4_1( - const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const MotionVector* LIBGAV1_RESTRICT temporal_mvs, + const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets, const int reference_offsets[2], const int count, - CompoundMotionVector* candidate_mvs) { + CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) { // |reference_offsets| non-zero check usually equals true and is ignored. // To facilitate the compilers, make a local copy of |reference_offsets|. const int offsets[2] = {reference_offsets[0], reference_offsets[1]}; @@ -160,9 +162,10 @@ void MvProjectionCompoundForceInteger_SSE4_1( } void MvProjectionCompoundHighPrecision_SSE4_1( - const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const MotionVector* LIBGAV1_RESTRICT temporal_mvs, + const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets, const int reference_offsets[2], const int count, - CompoundMotionVector* candidate_mvs) { + CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) { // |reference_offsets| non-zero check usually equals true and is ignored. // To facilitate the compilers, make a local copy of |reference_offsets|. const int offsets[2] = {reference_offsets[0], reference_offsets[1]}; @@ -177,8 +180,10 @@ void MvProjectionCompoundHighPrecision_SSE4_1( } void MvProjectionSingleLowPrecision_SSE4_1( - const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, - const int reference_offset, const int count, MotionVector* candidate_mvs) { + const MotionVector* LIBGAV1_RESTRICT temporal_mvs, + const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets, + const int reference_offset, const int count, + MotionVector* LIBGAV1_RESTRICT candidate_mvs) { // Up to three more elements could be calculated. int i = 0; do { @@ -190,8 +195,10 @@ void MvProjectionSingleLowPrecision_SSE4_1( } void MvProjectionSingleForceInteger_SSE4_1( - const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, - const int reference_offset, const int count, MotionVector* candidate_mvs) { + const MotionVector* LIBGAV1_RESTRICT temporal_mvs, + const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets, + const int reference_offset, const int count, + MotionVector* LIBGAV1_RESTRICT candidate_mvs) { // Up to three more elements could be calculated. 
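  // The SIMD path above projects four motion vectors at a time, so the loop
  // below can compute up to three entries beyond |count|; the |candidate_mvs|
  // buffer is presumably padded by the caller to make that overwrite safe.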
int i = 0; do { @@ -203,8 +210,10 @@ void MvProjectionSingleForceInteger_SSE4_1( } void MvProjectionSingleHighPrecision_SSE4_1( - const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, - const int reference_offset, const int count, MotionVector* candidate_mvs) { + const MotionVector* LIBGAV1_RESTRICT temporal_mvs, + const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets, + const int reference_offset, const int count, + MotionVector* LIBGAV1_RESTRICT candidate_mvs) { // Up to three more elements could be calculated. int i = 0; do { @@ -215,20 +224,10 @@ void MvProjectionSingleHighPrecision_SSE4_1( } while (i < count); } -void Init8bpp() { - Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); - assert(dsp != nullptr); - dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1; - dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1; - dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1; - dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1; - dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1; - dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1; -} +} // namespace -#if LIBGAV1_MAX_BITDEPTH >= 10 -void Init10bpp() { - Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); +void MotionVectorSearchInit_SSE4_1() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1; dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1; @@ -237,16 +236,6 @@ void Init10bpp() { dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1; dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1; } -#endif - -} // namespace - -void MotionVectorSearchInit_SSE4_1() { - Init8bpp(); -#if LIBGAV1_MAX_BITDEPTH >= 10 - Init10bpp(); -#endif -} } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc index c34a7f7..8ce23b4 100644 --- a/src/dsp/x86/obmc_sse4.cc +++ b/src/dsp/x86/obmc_sse4.cc @@ -37,8 +37,9 @@ namespace { #include "src/dsp/obmc.inc" inline void OverlapBlendFromLeft2xH_SSE4_1( - uint8_t* const prediction, const ptrdiff_t prediction_stride, - const int height, const uint8_t* const obmc_prediction, + uint8_t* LIBGAV1_RESTRICT const prediction, + const ptrdiff_t prediction_stride, const int height, + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, const ptrdiff_t obmc_prediction_stride) { uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; @@ -68,8 +69,9 @@ inline void OverlapBlendFromLeft2xH_SSE4_1( } inline void OverlapBlendFromLeft4xH_SSE4_1( - uint8_t* const prediction, const ptrdiff_t prediction_stride, - const int height, const uint8_t* const obmc_prediction, + uint8_t* LIBGAV1_RESTRICT const prediction, + const ptrdiff_t prediction_stride, const int height, + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, const ptrdiff_t obmc_prediction_stride) { uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; @@ -106,8 +108,9 @@ inline void OverlapBlendFromLeft4xH_SSE4_1( } inline void OverlapBlendFromLeft8xH_SSE4_1( - uint8_t* const prediction, const ptrdiff_t prediction_stride, - const int height, const uint8_t* const obmc_prediction, + uint8_t* LIBGAV1_RESTRICT const prediction, + const ptrdiff_t prediction_stride, const int height, + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, const ptrdiff_t 
obmc_prediction_stride) { uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; @@ -130,13 +133,15 @@ inline void OverlapBlendFromLeft8xH_SSE4_1( } while (--y != 0); } -void OverlapBlendFromLeft_SSE4_1(void* const prediction, - const ptrdiff_t prediction_stride, - const int width, const int height, - const void* const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { +void OverlapBlendFromLeft_SSE4_1( + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, + const int width, const int height, + const void* LIBGAV1_RESTRICT const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { auto* pred = static_cast<uint8_t*>(prediction); const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction); + assert(width >= 2); + assert(height >= 4); if (width == 2) { OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred, @@ -185,8 +190,9 @@ void OverlapBlendFromLeft_SSE4_1(void* const prediction, } inline void OverlapBlendFromTop4xH_SSE4_1( - uint8_t* const prediction, const ptrdiff_t prediction_stride, - const int height, const uint8_t* const obmc_prediction, + uint8_t* LIBGAV1_RESTRICT const prediction, + const ptrdiff_t prediction_stride, const int height, + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, const ptrdiff_t obmc_prediction_stride) { uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; @@ -227,8 +233,9 @@ inline void OverlapBlendFromTop4xH_SSE4_1( } inline void OverlapBlendFromTop8xH_SSE4_1( - uint8_t* const prediction, const ptrdiff_t prediction_stride, - const int height, const uint8_t* const obmc_prediction, + uint8_t* LIBGAV1_RESTRICT const prediction, + const ptrdiff_t prediction_stride, const int height, + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, const ptrdiff_t obmc_prediction_stride) { uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; @@ -253,15 +260,17 @@ inline void OverlapBlendFromTop8xH_SSE4_1( } while (--y != 0); } -void OverlapBlendFromTop_SSE4_1(void* const prediction, - const ptrdiff_t prediction_stride, - const int width, const int height, - const void* const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { +void OverlapBlendFromTop_SSE4_1( + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, + const int width, const int height, + const void* LIBGAV1_RESTRICT const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { auto* pred = static_cast<uint8_t*>(prediction); const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction); + assert(width >= 4); + assert(height >= 2); - if (width <= 4) { + if (width == 4) { OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred, obmc_prediction_stride); return; @@ -323,8 +332,9 @@ namespace { constexpr int kRoundBitsObmcBlend = 6; inline void OverlapBlendFromLeft2xH_SSE4_1( - uint16_t* const prediction, const ptrdiff_t pred_stride, const int height, - const uint16_t* const obmc_prediction, const ptrdiff_t obmc_pred_stride) { + uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride, + const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction, + const ptrdiff_t obmc_pred_stride) { uint16_t* pred = prediction; const uint16_t* obmc_pred = obmc_prediction; const ptrdiff_t pred_stride2 = pred_stride << 1; @@ -353,8 +363,9 @@ inline void OverlapBlendFromLeft2xH_SSE4_1( } inline void OverlapBlendFromLeft4xH_SSE4_1( - uint16_t* const prediction, const ptrdiff_t pred_stride, const int height, - const 
uint16_t* const obmc_prediction, const ptrdiff_t obmc_pred_stride) { + uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride, + const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction, + const ptrdiff_t obmc_pred_stride) { uint16_t* pred = prediction; const uint16_t* obmc_pred = obmc_prediction; const ptrdiff_t pred_stride2 = pred_stride << 1; @@ -385,16 +396,18 @@ inline void OverlapBlendFromLeft4xH_SSE4_1( } while (y != 0); } -void OverlapBlendFromLeft10bpp_SSE4_1(void* const prediction, - const ptrdiff_t prediction_stride, - const int width, const int height, - const void* const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { +void OverlapBlendFromLeft10bpp_SSE4_1( + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, + const int width, const int height, + const void* LIBGAV1_RESTRICT const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { auto* pred = static_cast<uint16_t*>(prediction); const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction); const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]); const ptrdiff_t obmc_pred_stride = obmc_prediction_stride / sizeof(obmc_pred[0]); + assert(width >= 2); + assert(height >= 4); if (width == 2) { OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred, @@ -437,54 +450,10 @@ void OverlapBlendFromLeft10bpp_SSE4_1(void* const prediction, } while (x < width); } -inline void OverlapBlendFromTop2xH_SSE4_1(uint16_t* const prediction, - const ptrdiff_t pred_stride, - const int height, - const uint16_t* const obmc_prediction, - const ptrdiff_t obmc_pred_stride) { - uint16_t* pred = prediction; - const uint16_t* obmc_pred = obmc_prediction; - const __m128i mask_inverter = _mm_set1_epi16(64); - const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0); - const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1); - const uint8_t* mask = kObmcMask + height - 2; - const int compute_height = - height - (height >> 2); // compute_height based on 8-bit opt - const ptrdiff_t pred_stride2 = pred_stride << 1; - const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1; - int y = 0; - do { - // First mask in the first half, second mask in the second half. 
- const __m128i mask_val = _mm_shuffle_epi8(Load4(mask + y), mask_shuffler); - const __m128i masks = - _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter)); - const __m128i masks_lo = _mm_cvtepi8_epi16(masks); - const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8)); - - const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride); - const __m128i obmc_pred_val = - LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride); - const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val); - const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val); - const __m128i result_lo = RightShiftWithRounding_U32( - _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend); - const __m128i result_hi = RightShiftWithRounding_U32( - _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend); - const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi); - - Store4(pred, packed_result); - Store4(pred + pred_stride, _mm_srli_si128(packed_result, 8)); - pred += pred_stride2; - obmc_pred += obmc_pred_stride2; - y += 2; - } while (y < compute_height); -} - -inline void OverlapBlendFromTop4xH_SSE4_1(uint16_t* const prediction, - const ptrdiff_t pred_stride, - const int height, - const uint16_t* const obmc_prediction, - const ptrdiff_t obmc_pred_stride) { +inline void OverlapBlendFromTop4xH_SSE4_1( + uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride, + const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction, + const ptrdiff_t obmc_pred_stride) { uint16_t* pred = prediction; const uint16_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_set1_epi16(64); @@ -522,22 +491,19 @@ inline void OverlapBlendFromTop4xH_SSE4_1(uint16_t* const prediction, } while (y < compute_height); } -void OverlapBlendFromTop10bpp_SSE4_1(void* const prediction, - const ptrdiff_t prediction_stride, - const int width, const int height, - const void* const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { +void OverlapBlendFromTop10bpp_SSE4_1( + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, + const int width, const int height, + const void* LIBGAV1_RESTRICT const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { auto* pred = static_cast<uint16_t*>(prediction); const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction); const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]); const ptrdiff_t obmc_pred_stride = obmc_prediction_stride / sizeof(obmc_pred[0]); + assert(width >= 4); + assert(height >= 2); - if (width == 2) { - OverlapBlendFromTop2xH_SSE4_1(pred, pred_stride, height, obmc_pred, - obmc_pred_stride); - return; - } if (width == 4) { OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred, obmc_pred_stride); diff --git a/src/dsp/x86/super_res_sse4.cc b/src/dsp/x86/super_res_sse4.cc index 85d05bc..458d94e 100644 --- a/src/dsp/x86/super_res_sse4.cc +++ b/src/dsp/x86/super_res_sse4.cc @@ -90,11 +90,13 @@ void SuperResCoefficients_SSE4_1(const int upscaled_width, } while (--x != 0); } -void SuperRes_SSE4_1(const void* const coefficients, void* const source, +void SuperRes_SSE4_1(const void* LIBGAV1_RESTRICT const coefficients, + void* LIBGAV1_RESTRICT const source, const ptrdiff_t source_stride, const int height, const int downscaled_width, const int upscaled_width, const int initial_subpixel_x, const int step, - void* const dest, const ptrdiff_t dest_stride) { + void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dest_stride) { auto* src 
= static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps); auto* dst = static_cast<uint8_t*>(dest); int y = height; @@ -227,11 +229,13 @@ void SuperResCoefficients_SSE4_1(const int upscaled_width, } template <int bitdepth> -void SuperRes_SSE4_1(const void* const coefficients, void* const source, +void SuperRes_SSE4_1(const void* LIBGAV1_RESTRICT const coefficients, + void* LIBGAV1_RESTRICT const source, const ptrdiff_t source_stride, const int height, const int downscaled_width, const int upscaled_width, const int initial_subpixel_x, const int step, - void* const dest, const ptrdiff_t dest_stride) { + void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dest_stride) { auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps); auto* dst = static_cast<uint16_t*>(dest); int y = height; diff --git a/src/dsp/x86/warp_sse4.cc b/src/dsp/x86/warp_sse4.cc index 9ddfeac..5830894 100644 --- a/src/dsp/x86/warp_sse4.cc +++ b/src/dsp/x86/warp_sse4.cc @@ -101,7 +101,7 @@ inline void HorizontalFilter(const int sx4, const int16_t alpha, template <bool is_compound> inline void WriteVerticalFilter(const __m128i filter[8], const int16_t intermediate_result[15][8], int y, - void* dst_row) { + void* LIBGAV1_RESTRICT dst_row) { constexpr int kRoundBitsVertical = is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical; __m128i sum_low = _mm_set1_epi32(kOffsetRemoval); @@ -136,8 +136,9 @@ inline void WriteVerticalFilter(const __m128i filter[8], template <bool is_compound> inline void WriteVerticalFilter(const __m128i filter[8], - const int16_t* intermediate_result_column, - void* dst_row) { + const int16_t* LIBGAV1_RESTRICT + intermediate_result_column, + void* LIBGAV1_RESTRICT dst_row) { constexpr int kRoundBitsVertical = is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical; __m128i sum_low = _mm_setzero_si128(); @@ -167,7 +168,7 @@ inline void WriteVerticalFilter(const __m128i filter[8], template <bool is_compound, typename DestType> inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma, - int delta, DestType* dest_row, + int delta, DestType* LIBGAV1_RESTRICT dest_row, ptrdiff_t dest_stride) { int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); for (int y = 0; y < 8; ++y) { @@ -187,8 +188,9 @@ inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma, } template <bool is_compound, typename DestType> -inline void VerticalFilter(const int16_t* source_cols, int y4, int gamma, - int delta, DestType* dest_row, +inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols, int y4, + int gamma, int delta, + DestType* LIBGAV1_RESTRICT dest_row, ptrdiff_t dest_stride) { int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); for (int y = 0; y < 8; ++y) { @@ -208,9 +210,11 @@ inline void VerticalFilter(const int16_t* source_cols, int y4, int gamma, } template <bool is_compound, typename DestType> -inline void WarpRegion1(const uint8_t* src, ptrdiff_t source_stride, - int source_width, int source_height, int ix4, int iy4, - DestType* dst_row, ptrdiff_t dest_stride) { +inline void WarpRegion1(const uint8_t* LIBGAV1_RESTRICT src, + ptrdiff_t source_stride, int source_width, + int source_height, int ix4, int iy4, + DestType* LIBGAV1_RESTRICT dst_row, + ptrdiff_t dest_stride) { // Region 1 // Points to the left or right border of the first row of |src|. 
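  // How the four Warp regions divide the work (inferred from the helpers in
  // this file): Region 1 has the filter support out of bounds both
  // horizontally and vertically, Region 2 only horizontally, Region 3 only
  // vertically, and Region 4 is the fully in-bounds general case.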
const uint8_t* first_row_border = @@ -244,10 +248,12 @@ inline void WarpRegion1(const uint8_t* src, ptrdiff_t source_stride, } template <bool is_compound, typename DestType> -inline void WarpRegion2(const uint8_t* src, ptrdiff_t source_stride, - int source_width, int y4, int ix4, int iy4, int gamma, - int delta, int16_t intermediate_result_column[15], - DestType* dst_row, ptrdiff_t dest_stride) { +inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src, + ptrdiff_t source_stride, int source_width, int y4, + int ix4, int iy4, int gamma, int delta, + int16_t intermediate_result_column[15], + DestType* LIBGAV1_RESTRICT dst_row, + ptrdiff_t dest_stride) { // Region 2. // Points to the left or right border of the first row of |src|. const uint8_t* first_row_border = @@ -283,9 +289,10 @@ inline void WarpRegion2(const uint8_t* src, ptrdiff_t source_stride, } template <bool is_compound, typename DestType> -inline void WarpRegion3(const uint8_t* src, ptrdiff_t source_stride, - int source_height, int alpha, int beta, int x4, int ix4, - int iy4, int16_t intermediate_result[15][8]) { +inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src, + ptrdiff_t source_stride, int source_height, int alpha, + int beta, int x4, int ix4, int iy4, + int16_t intermediate_result[15][8]) { // Region 3 // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. @@ -315,9 +322,9 @@ inline void WarpRegion3(const uint8_t* src, ptrdiff_t source_stride, } template <bool is_compound, typename DestType> -inline void WarpRegion4(const uint8_t* src, ptrdiff_t source_stride, int alpha, - int beta, int x4, int ix4, int iy4, - int16_t intermediate_result[15][8]) { +inline void WarpRegion4(const uint8_t* LIBGAV1_RESTRICT src, + ptrdiff_t source_stride, int alpha, int beta, int x4, + int ix4, int iy4, int16_t intermediate_result[15][8]) { // Region 4. // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. 
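Why the intermediate buffer in these region helpers spans 15 rows: the vertical pass applies an 8-tap filter to produce 8 output rows, so it must see 8 + 8 - 1 horizontally filtered rows. A one-line check of that sizing rule (identifiers here are descriptive, not the library's):

    constexpr int kWarpOutputRows = 8;
    constexpr int kWarpFilterTaps = 8;
    constexpr int kWarpIntermediateRows = kWarpOutputRows + kWarpFilterTaps - 1;
    static_assert(kWarpIntermediateRows == 15,
                  "matches int16_t intermediate_result[15][8]");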
@@ -351,12 +358,14 @@ inline void WarpRegion4(const uint8_t* src, ptrdiff_t source_stride, int alpha, } template <bool is_compound, typename DestType> -inline void HandleWarpBlock(const uint8_t* src, ptrdiff_t source_stride, - int source_width, int source_height, - const int* warp_params, int subsampling_x, - int subsampling_y, int src_x, int src_y, - int16_t alpha, int16_t beta, int16_t gamma, - int16_t delta, DestType* dst_row, +inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src, + ptrdiff_t source_stride, int source_width, + int source_height, + const int* LIBGAV1_RESTRICT warp_params, + int subsampling_x, int subsampling_y, int src_x, + int src_y, int16_t alpha, int16_t beta, + int16_t gamma, int16_t delta, + DestType* LIBGAV1_RESTRICT dst_row, ptrdiff_t dest_stride) { union { // Intermediate_result is the output of the horizontal filtering and @@ -460,11 +469,12 @@ inline void HandleWarpBlock(const uint8_t* src, ptrdiff_t source_stride, } template <bool is_compound> -void Warp_SSE4_1(const void* source, ptrdiff_t source_stride, int source_width, - int source_height, const int* warp_params, int subsampling_x, +void Warp_SSE4_1(const void* LIBGAV1_RESTRICT source, ptrdiff_t source_stride, + int source_width, int source_height, + const int* LIBGAV1_RESTRICT warp_params, int subsampling_x, int subsampling_y, int block_start_x, int block_start_y, int block_width, int block_height, int16_t alpha, int16_t beta, - int16_t gamma, int16_t delta, void* dest, + int16_t gamma, int16_t delta, void* LIBGAV1_RESTRICT dest, ptrdiff_t dest_stride) { const auto* const src = static_cast<const uint8_t*>(source); using DestType = diff --git a/src/dsp/x86/weight_mask_sse4.cc b/src/dsp/x86/weight_mask_sse4.cc index 08a1739..69cb784 100644 --- a/src/dsp/x86/weight_mask_sse4.cc +++ b/src/dsp/x86/weight_mask_sse4.cc @@ -37,8 +37,9 @@ namespace { constexpr int kRoundingBits8bpp = 4; template <bool mask_is_inverse, bool is_store_16> -inline void WeightMask16_SSE4(const int16_t* prediction_0, - const int16_t* prediction_1, uint8_t* mask, +inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0, + const int16_t* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const __m128i pred_00 = LoadAligned16(prediction_0); const __m128i pred_10 = LoadAligned16(prediction_1); @@ -86,8 +87,9 @@ inline void WeightMask16_SSE4(const int16_t* prediction_0, mask += mask_stride << 1 template <bool mask_is_inverse> -void WeightMask8x8_SSE4(const void* prediction_0, const void* prediction_1, - uint8_t* mask, ptrdiff_t mask_stride) { +void WeightMask8x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); @@ -98,8 +100,10 @@ void WeightMask8x8_SSE4(const void* prediction_0, const void* prediction_1, } template <bool mask_is_inverse> -void WeightMask8x16_SSE4(const void* prediction_0, const void* prediction_1, - uint8_t* mask, ptrdiff_t mask_stride) { +void WeightMask8x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 3; @@ -112,8 +116,10 @@ void WeightMask8x16_SSE4(const void* prediction_0, const void* 
prediction_1, } template <bool mask_is_inverse> -void WeightMask8x32_SSE4(const void* prediction_0, const void* prediction_1, - uint8_t* mask, ptrdiff_t mask_stride) { +void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y5 = 5; @@ -135,8 +141,10 @@ void WeightMask8x32_SSE4(const void* prediction_0, const void* prediction_1, mask += mask_stride template <bool mask_is_inverse> -void WeightMask16x8_SSE4(const void* prediction_0, const void* prediction_1, - uint8_t* mask, ptrdiff_t mask_stride) { +void WeightMask16x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y = 7; @@ -147,8 +155,10 @@ void WeightMask16x8_SSE4(const void* prediction_0, const void* prediction_1, } template <bool mask_is_inverse> -void WeightMask16x16_SSE4(const void* prediction_0, const void* prediction_1, - uint8_t* mask, ptrdiff_t mask_stride) { +void WeightMask16x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 5; @@ -161,8 +171,10 @@ void WeightMask16x16_SSE4(const void* prediction_0, const void* prediction_1, } template <bool mask_is_inverse> -void WeightMask16x32_SSE4(const void* prediction_0, const void* prediction_1, - uint8_t* mask, ptrdiff_t mask_stride) { +void WeightMask16x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y5 = 6; @@ -178,8 +190,10 @@ void WeightMask16x32_SSE4(const void* prediction_0, const void* prediction_1, } template <bool mask_is_inverse> -void WeightMask16x64_SSE4(const void* prediction_0, const void* prediction_1, - uint8_t* mask, ptrdiff_t mask_stride) { +void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 21; @@ -203,8 +217,10 @@ void WeightMask16x64_SSE4(const void* prediction_0, const void* prediction_1, mask += mask_stride template <bool mask_is_inverse> -void WeightMask32x8_SSE4(const void* prediction_0, const void* prediction_1, - uint8_t* mask, ptrdiff_t mask_stride) { +void WeightMask32x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); WEIGHT32_AND_STRIDE; @@ -218,8 +234,10 @@ void WeightMask32x8_SSE4(const void* prediction_0, const void* prediction_1, } template <bool mask_is_inverse> -void WeightMask32x16_SSE4(const void* 
prediction_0, const void* prediction_1, - uint8_t* mask, ptrdiff_t mask_stride) { +void WeightMask32x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 5; @@ -232,8 +250,10 @@ void WeightMask32x16_SSE4(const void* prediction_0, const void* prediction_1, } template <bool mask_is_inverse> -void WeightMask32x32_SSE4(const void* prediction_0, const void* prediction_1, - uint8_t* mask, ptrdiff_t mask_stride) { +void WeightMask32x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y5 = 6; @@ -249,8 +269,10 @@ void WeightMask32x32_SSE4(const void* prediction_0, const void* prediction_1, } template <bool mask_is_inverse> -void WeightMask32x64_SSE4(const void* prediction_0, const void* prediction_1, - uint8_t* mask, ptrdiff_t mask_stride) { +void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 21; @@ -278,8 +300,10 @@ void WeightMask32x64_SSE4(const void* prediction_0, const void* prediction_1, mask += mask_stride template <bool mask_is_inverse> -void WeightMask64x16_SSE4(const void* prediction_0, const void* prediction_1, - uint8_t* mask, ptrdiff_t mask_stride) { +void WeightMask64x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 0; @@ -292,8 +316,10 @@ void WeightMask64x16_SSE4(const void* prediction_0, const void* prediction_1, } template <bool mask_is_inverse> -void WeightMask64x32_SSE4(const void* prediction_0, const void* prediction_1, - uint8_t* mask, ptrdiff_t mask_stride) { +void WeightMask64x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y5 = 0; @@ -309,8 +335,10 @@ void WeightMask64x32_SSE4(const void* prediction_0, const void* prediction_1, } template <bool mask_is_inverse> -void WeightMask64x64_SSE4(const void* prediction_0, const void* prediction_1, - uint8_t* mask, ptrdiff_t mask_stride) { +void WeightMask64x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 0; @@ -323,8 +351,10 @@ void WeightMask64x64_SSE4(const void* prediction_0, const void* prediction_1, } template <bool mask_is_inverse> -void WeightMask64x128_SSE4(const void* prediction_0, const void* prediction_1, - uint8_t* mask, ptrdiff_t mask_stride) { +void 
WeightMask64x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 0; @@ -338,8 +368,10 @@ void WeightMask64x128_SSE4(const void* prediction_0, const void* prediction_1, } template <bool mask_is_inverse> -void WeightMask128x64_SSE4(const void* prediction_0, const void* prediction_1, - uint8_t* mask, ptrdiff_t mask_stride) { +void WeightMask128x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 0; @@ -380,8 +412,10 @@ void WeightMask128x64_SSE4(const void* prediction_0, const void* prediction_1, } template <bool mask_is_inverse> -void WeightMask128x128_SSE4(const void* prediction_0, const void* prediction_1, - uint8_t* mask, ptrdiff_t mask_stride) { +void WeightMask128x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 0; @@ -467,9 +501,10 @@ constexpr int kRoundingBits10bpp = 6; constexpr int kScaledDiffShift = 4; template <bool mask_is_inverse, bool is_store_16> -inline void WeightMask16_10bpp_SSE4(const uint16_t* prediction_0, - const uint16_t* prediction_1, uint8_t* mask, - ptrdiff_t mask_stride) { +inline void WeightMask16_10bpp_SSE4( + const uint16_t* LIBGAV1_RESTRICT prediction_0, + const uint16_t* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const __m128i diff_offset = _mm_set1_epi8(38); const __m128i mask_ceiling = _mm_set1_epi8(64); const __m128i zero = _mm_setzero_si128(); @@ -538,8 +573,9 @@ inline void WeightMask16_10bpp_SSE4(const uint16_t* prediction_0, mask += mask_stride << 1 template <bool mask_is_inverse> -void WeightMask8x8_10bpp_SSE4(const void* prediction_0, - const void* prediction_1, uint8_t* mask, +void WeightMask8x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); @@ -551,8 +587,9 @@ void WeightMask8x8_10bpp_SSE4(const void* prediction_0, } template <bool mask_is_inverse> -void WeightMask8x16_10bpp_SSE4(const void* prediction_0, - const void* prediction_1, uint8_t* mask, +void WeightMask8x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); @@ -566,8 +603,9 @@ void WeightMask8x16_10bpp_SSE4(const void* prediction_0, } template <bool mask_is_inverse> -void WeightMask8x32_10bpp_SSE4(const void* prediction_0, - const void* prediction_1, uint8_t* mask, +void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const auto* 
pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); @@ -591,8 +629,9 @@ void WeightMask8x32_10bpp_SSE4(const void* prediction_0, mask += mask_stride template <bool mask_is_inverse> -void WeightMask16x8_10bpp_SSE4(const void* prediction_0, - const void* prediction_1, uint8_t* mask, +void WeightMask16x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); @@ -604,8 +643,9 @@ void WeightMask16x8_10bpp_SSE4(const void* prediction_0, } template <bool mask_is_inverse> -void WeightMask16x16_10bpp_SSE4(const void* prediction_0, - const void* prediction_1, uint8_t* mask, +void WeightMask16x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); @@ -619,8 +659,9 @@ void WeightMask16x16_10bpp_SSE4(const void* prediction_0, } template <bool mask_is_inverse> -void WeightMask16x32_10bpp_SSE4(const void* prediction_0, - const void* prediction_1, uint8_t* mask, +void WeightMask16x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); @@ -637,8 +678,9 @@ void WeightMask16x32_10bpp_SSE4(const void* prediction_0, } template <bool mask_is_inverse> -void WeightMask16x64_10bpp_SSE4(const void* prediction_0, - const void* prediction_1, uint8_t* mask, +void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); @@ -664,8 +706,9 @@ void WeightMask16x64_10bpp_SSE4(const void* prediction_0, mask += mask_stride template <bool mask_is_inverse> -void WeightMask32x8_10bpp_SSE4(const void* prediction_0, - const void* prediction_1, uint8_t* mask, +void WeightMask32x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); @@ -680,8 +723,9 @@ void WeightMask32x8_10bpp_SSE4(const void* prediction_0, } template <bool mask_is_inverse> -void WeightMask32x16_10bpp_SSE4(const void* prediction_0, - const void* prediction_1, uint8_t* mask, +void WeightMask32x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); @@ -695,8 +739,9 @@ void WeightMask32x16_10bpp_SSE4(const void* prediction_0, } template <bool mask_is_inverse> -void WeightMask32x32_10bpp_SSE4(const void* prediction_0, - const void* prediction_1, uint8_t* mask, +void WeightMask32x32_10bpp_SSE4(const void* 
LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); @@ -713,8 +758,9 @@ void WeightMask32x32_10bpp_SSE4(const void* prediction_0, } template <bool mask_is_inverse> -void WeightMask32x64_10bpp_SSE4(const void* prediction_0, - const void* prediction_1, uint8_t* mask, +void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); @@ -744,8 +790,9 @@ void WeightMask32x64_10bpp_SSE4(const void* prediction_0, mask += mask_stride template <bool mask_is_inverse> -void WeightMask64x16_10bpp_SSE4(const void* prediction_0, - const void* prediction_1, uint8_t* mask, +void WeightMask64x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); @@ -759,8 +806,9 @@ void WeightMask64x16_10bpp_SSE4(const void* prediction_0, } template <bool mask_is_inverse> -void WeightMask64x32_10bpp_SSE4(const void* prediction_0, - const void* prediction_1, uint8_t* mask, +void WeightMask64x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); @@ -777,8 +825,9 @@ void WeightMask64x32_10bpp_SSE4(const void* prediction_0, } template <bool mask_is_inverse> -void WeightMask64x64_10bpp_SSE4(const void* prediction_0, - const void* prediction_1, uint8_t* mask, +void WeightMask64x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); @@ -792,8 +841,9 @@ void WeightMask64x64_10bpp_SSE4(const void* prediction_0, } template <bool mask_is_inverse> -void WeightMask64x128_10bpp_SSE4(const void* prediction_0, - const void* prediction_1, uint8_t* mask, +void WeightMask64x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); @@ -808,8 +858,9 @@ void WeightMask64x128_10bpp_SSE4(const void* prediction_0, } template <bool mask_is_inverse> -void WeightMask128x64_10bpp_SSE4(const void* prediction_0, - const void* prediction_1, uint8_t* mask, +void WeightMask128x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); @@ -851,8 +902,9 @@ void WeightMask128x64_10bpp_SSE4(const void* prediction_0, } template <bool mask_is_inverse> -void 
WeightMask128x128_10bpp_SSE4(const void* prediction_0, - const void* prediction_1, uint8_t* mask, +void WeightMask128x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
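A scalar reference for the weight-mask computation these kernels vectorize, assembled from the constants visible in this file (kRoundingBits8bpp = 4, kRoundingBits10bpp = 6, kScaledDiffShift = 4, the offset 38 and the ceiling 64); a sketch under those assumptions, not the library's C fallback:

    #include <algorithm>
    #include <cstdint>
    #include <cstdlib>

    // One mask value from a pair of 16-bit compound predictions.
    inline uint8_t WeightMaskValue(int pred_0, int pred_1, int rounding_bits,
                                   bool mask_is_inverse) {
      // Rounded right shift of the absolute prediction difference.
      const int diff =
          (std::abs(pred_0 - pred_1) + (1 << (rounding_bits - 1))) >>
          rounding_bits;
      // Scale the difference down by 16, add the offset, clamp to the ceiling.
      const int value = std::min(38 + (diff >> 4), 64);
      return static_cast<uint8_t>(mask_is_inverse ? 64 - value : value);
    }

The LIBGAV1_RESTRICT qualifier threaded through every signature in this diff is assumed to expand to the compiler's restrict extension, roughly:

    // Presumed definition; the actual header may differ.
    #if defined(_MSC_VER) || defined(__GNUC__) || defined(__clang__)
    #define LIBGAV1_RESTRICT __restrict
    #else
    #define LIBGAV1_RESTRICT
    #endif

Promising the compiler that predictions, masks, and destinations never alias lets it keep loaded vectors in registers across the stores in these loops instead of conservatively reloading them.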