diff options
Diffstat (limited to 'src/dsp/x86/intrapred_smooth_sse4.cc')
-rw-r--r-- | src/dsp/x86/intrapred_smooth_sse4.cc | 378 |
1 files changed, 213 insertions, 165 deletions
diff --git a/src/dsp/x86/intrapred_smooth_sse4.cc b/src/dsp/x86/intrapred_smooth_sse4.cc index de9f551..b53ee8c 100644 --- a/src/dsp/x86/intrapred_smooth_sse4.cc +++ b/src/dsp/x86/intrapred_smooth_sse4.cc @@ -38,23 +38,12 @@ namespace { // to have visibility of the values. This helps reduce loads and in the // creation of the inverse weights. constexpr uint8_t kSmoothWeights[] = { - // block dimension = 4 - 255, 149, 85, 64, - // block dimension = 8 - 255, 197, 146, 105, 73, 50, 37, 32, - // block dimension = 16 - 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, - // block dimension = 32 - 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, - 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, - // block dimension = 64 - 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, - 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, - 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, - 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4}; +#include "src/dsp/smooth_weights.inc" +}; template <int y_mask> -inline void WriteSmoothHorizontalSum4(void* const dest, const __m128i& left, +inline void WriteSmoothHorizontalSum4(void* LIBGAV1_RESTRICT const dest, + const __m128i& left, const __m128i& weights, const __m128i& scaled_top_right, const __m128i& round) { @@ -77,7 +66,8 @@ inline __m128i SmoothDirectionalSum8(const __m128i& pixels, return _mm_add_epi16(scaled_corner, weighted_px); } -inline void WriteSmoothDirectionalSum8(uint8_t* dest, const __m128i& pixels, +inline void WriteSmoothDirectionalSum8(uint8_t* LIBGAV1_RESTRICT dest, + const __m128i& pixels, const __m128i& weights, const __m128i& scaled_corner, const __m128i& round) { @@ -91,13 +81,11 @@ inline void WriteSmoothDirectionalSum8(uint8_t* dest, const __m128i& pixels, // For Horizontal, pixels1 and pixels2 are the same repeated value. For // Vertical, weights1 and weights2 are the same, and scaled_corner1 and // scaled_corner2 are the same. -inline void WriteSmoothDirectionalSum16(uint8_t* dest, const __m128i& pixels1, - const __m128i& pixels2, - const __m128i& weights1, - const __m128i& weights2, - const __m128i& scaled_corner1, - const __m128i& scaled_corner2, - const __m128i& round) { +inline void WriteSmoothDirectionalSum16( + uint8_t* LIBGAV1_RESTRICT dest, const __m128i& pixels1, + const __m128i& pixels2, const __m128i& weights1, const __m128i& weights2, + const __m128i& scaled_corner1, const __m128i& scaled_corner2, + const __m128i& round) { const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1); const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2); const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1); @@ -109,8 +97,9 @@ inline void WriteSmoothDirectionalSum16(uint8_t* dest, const __m128i& pixels1, } template <int y_mask> -inline void WriteSmoothPredSum4(uint8_t* const dest, const __m128i& top, - const __m128i& left, const __m128i& weights_x, +inline void WriteSmoothPredSum4(uint8_t* LIBGAV1_RESTRICT const dest, + const __m128i& top, const __m128i& left, + const __m128i& weights_x, const __m128i& weights_y, const __m128i& scaled_bottom_left, const __m128i& scaled_top_right, @@ -135,7 +124,8 @@ inline void WriteSmoothPredSum4(uint8_t* const dest, const __m128i& top, // pixels[0]: above and below_pred interleave vector // pixels[1]: left vector // pixels[2]: right_pred vector -inline void LoadSmoothPixels4(const uint8_t* above, const uint8_t* left, +inline void LoadSmoothPixels4(const uint8_t* LIBGAV1_RESTRICT above, + const uint8_t* LIBGAV1_RESTRICT left, const int height, __m128i* pixels) { if (height == 4) { pixels[1] = Load4(left); @@ -156,8 +146,9 @@ inline void LoadSmoothPixels4(const uint8_t* above, const uint8_t* left, // weight_h[2]: same as [0], second half for height = 16 only // weight_h[3]: same as [1], second half for height = 16 only // weight_w[0]: weights_w and scale - weights_w interleave vector -inline void LoadSmoothWeights4(const uint8_t* weight_array, const int height, - __m128i* weight_h, __m128i* weight_w) { +inline void LoadSmoothWeights4(const uint8_t* LIBGAV1_RESTRICT weight_array, + const int height, __m128i* weight_h, + __m128i* weight_w) { const __m128i scale = _mm_set1_epi16(256); const __m128i x_weights = Load4(weight_array); weight_h[0] = _mm_cvtepu8_epi16(x_weights); @@ -179,7 +170,8 @@ inline void LoadSmoothWeights4(const uint8_t* weight_array, const int height, } inline void WriteSmoothPred4x8(const __m128i* pixel, const __m128i* weights_y, - const __m128i* weight_x, uint8_t* dst, + const __m128i* weight_x, + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const bool use_second_half) { const __m128i round = _mm_set1_epi32(256); @@ -215,8 +207,9 @@ inline void WriteSmoothPred4x8(const __m128i* pixel, const __m128i* weights_y, // The interleaving approach has some overhead that causes it to underperform in // the 4x4 case. -void Smooth4x4_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* top_row, const void* left_column) { +void Smooth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT top_row, + const void* LIBGAV1_RESTRICT left_column) { const __m128i top = _mm_cvtepu8_epi32(Load4(top_row)); const __m128i left = _mm_cvtepu8_epi32(Load4(left_column)); const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights)); @@ -247,8 +240,9 @@ void Smooth4x4_SSE4_1(void* const dest, const ptrdiff_t stride, scaled_bottom_left, scaled_top_right, scale); } -void Smooth4x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* top_row, const void* left_column) { +void Smooth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT top_row, + const void* LIBGAV1_RESTRICT left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); __m128i weights_x[1]; @@ -260,8 +254,10 @@ void Smooth4x8_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false); } -void Smooth4x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* top_row, const void* left_column) { +void Smooth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT top_row, + const void* LIBGAV1_RESTRICT left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); __m128i weights_x[1]; @@ -283,7 +279,8 @@ void Smooth4x16_SSE4_1(void* const dest, const ptrdiff_t stride, // pixels[5]: above and below_pred interleave vector, second half // pixels[6]: left vector + 16 // pixels[7]: right_pred vector -inline void LoadSmoothPixels8(const uint8_t* above, const uint8_t* left, +inline void LoadSmoothPixels8(const uint8_t* LIBGAV1_RESTRICT above, + const uint8_t* LIBGAV1_RESTRICT left, const int height, __m128i* pixels) { const __m128i bottom_left = _mm_set1_epi16(left[height - 1]); __m128i top_row = _mm_cvtepu8_epi16(LoadLo8(above)); @@ -317,8 +314,9 @@ inline void LoadSmoothPixels8(const uint8_t* above, const uint8_t* left, // weight_h[7]: same as [1], offset 24 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half // weight_w[1]: weights_w and scale - weights_w interleave vector, second half -inline void LoadSmoothWeights8(const uint8_t* weight_array, const int height, - __m128i* weight_w, __m128i* weight_h) { +inline void LoadSmoothWeights8(const uint8_t* LIBGAV1_RESTRICT weight_array, + const int height, __m128i* weight_w, + __m128i* weight_h) { const int offset = (height < 8) ? 0 : 4; __m128i loaded_weights = LoadUnaligned16(&weight_array[offset]); weight_h[0] = _mm_cvtepu8_epi16(loaded_weights); @@ -360,7 +358,8 @@ inline void LoadSmoothWeights8(const uint8_t* weight_array, const int height, inline void WriteSmoothPred8xH(const __m128i* pixels, const __m128i* weights_x, const __m128i* weights_y, const int height, - uint8_t* dst, const ptrdiff_t stride, + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t stride, const bool use_second_half) { const __m128i round = _mm_set1_epi32(256); const __m128i mask_increment = _mm_set1_epi16(0x0202); @@ -405,8 +404,9 @@ inline void WriteSmoothPred8xH(const __m128i* pixels, const __m128i* weights_x, } } -void Smooth8x4_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* top_row, const void* left_column) { +void Smooth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT top_row, + const void* LIBGAV1_RESTRICT left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); __m128i pixels[4]; @@ -419,8 +419,9 @@ void Smooth8x4_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothPred8xH(pixels, weights_x, weights_y, 4, dst, stride, false); } -void Smooth8x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* top_row, const void* left_column) { +void Smooth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT top_row, + const void* LIBGAV1_RESTRICT left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); @@ -434,8 +435,10 @@ void Smooth8x8_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false); } -void Smooth8x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* top_row, const void* left_column) { +void Smooth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT top_row, + const void* LIBGAV1_RESTRICT left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); __m128i pixels[4]; @@ -450,8 +453,10 @@ void Smooth8x16_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true); } -void Smooth8x32_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* top_row, const void* left_column) { +void Smooth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT top_row, + const void* LIBGAV1_RESTRICT left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); __m128i pixels[8]; @@ -473,8 +478,9 @@ void Smooth8x32_SSE4_1(void* const dest, const ptrdiff_t stride, } template <int width, int height> -void SmoothWxH(void* const dest, const ptrdiff_t stride, - const void* const top_row, const void* const left_column) { +void SmoothWxH(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); const uint8_t* const sm_weights_h = kSmoothWeights + height - 4; @@ -532,8 +538,10 @@ void SmoothWxH(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal4x4_SSE4_1(void* dest, const ptrdiff_t stride, - const void* top_row, const void* left_column) { +void SmoothHorizontal4x4_SSE4_1(void* LIBGAV1_RESTRICT dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT top_row, + const void* LIBGAV1_RESTRICT left_column) { const auto* const top_ptr = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi32(top_ptr[3]); const auto* const left_ptr = static_cast<const uint8_t*>(left_column); @@ -553,9 +561,10 @@ void SmoothHorizontal4x4_SSE4_1(void* dest, const ptrdiff_t stride, WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale); } -void SmoothHorizontal4x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal4x8_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi32(top[3]); const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights)); @@ -585,9 +594,10 @@ void SmoothHorizontal4x8_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale); } -void SmoothHorizontal4x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal4x16_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi32(top[3]); const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights)); @@ -637,9 +647,10 @@ void SmoothHorizontal4x16_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale); } -void SmoothHorizontal8x4_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal8x4_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[7]); const __m128i left = _mm_cvtepu8_epi16(Load4(left_column)); @@ -666,9 +677,10 @@ void SmoothHorizontal8x4_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale); } -void SmoothHorizontal8x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal8x8_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[7]); const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column)); @@ -686,9 +698,10 @@ void SmoothHorizontal8x8_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal8x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal8x16_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[7]); const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4)); @@ -714,9 +727,10 @@ void SmoothHorizontal8x16_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal8x32_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal8x32_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[7]); const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4)); @@ -756,9 +770,10 @@ void SmoothHorizontal8x32_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal16x4_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal16x4_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[15]); const __m128i left = _mm_cvtepu8_epi16(Load4(left_column)); @@ -795,9 +810,10 @@ void SmoothHorizontal16x4_SSE4_1(void* const dest, const ptrdiff_t stride, scaled_top_right1, scaled_top_right2, scale); } -void SmoothHorizontal16x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal16x8_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[15]); const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column)); @@ -822,9 +838,10 @@ void SmoothHorizontal16x8_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal16x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal16x16_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[15]); const __m128i weights = LoadUnaligned16(kSmoothWeights + 12); @@ -858,9 +875,10 @@ void SmoothHorizontal16x16_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal16x32_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal16x32_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[15]); const __m128i weights = LoadUnaligned16(kSmoothWeights + 12); @@ -910,9 +928,10 @@ void SmoothHorizontal16x32_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal16x64_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal16x64_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[15]); const __m128i weights = LoadUnaligned16(kSmoothWeights + 12); @@ -940,9 +959,10 @@ void SmoothHorizontal16x64_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal32x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal32x8_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[31]); const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column)); @@ -978,9 +998,10 @@ void SmoothHorizontal32x8_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal32x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal32x16_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[31]); const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column)); @@ -1027,9 +1048,10 @@ void SmoothHorizontal32x16_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal32x32_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal32x32_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[31]); const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28); @@ -1096,9 +1118,10 @@ void SmoothHorizontal32x32_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal32x64_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal32x64_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[31]); const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28); @@ -1137,9 +1160,10 @@ void SmoothHorizontal32x64_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal64x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal64x16_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[63]); const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column)); @@ -1212,9 +1236,10 @@ void SmoothHorizontal64x16_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal64x32_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal64x32_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[63]); const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column)); @@ -1315,9 +1340,10 @@ void SmoothHorizontal64x32_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothHorizontal64x64_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothHorizontal64x64_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const __m128i top_right = _mm_set1_epi16(top[63]); const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60); @@ -1378,7 +1404,8 @@ void SmoothHorizontal64x64_SSE4_1(void* const dest, const ptrdiff_t stride, } } -inline void LoadSmoothVerticalPixels4(const uint8_t* above, const uint8_t* left, +inline void LoadSmoothVerticalPixels4(const uint8_t* LIBGAV1_RESTRICT above, + const uint8_t* LIBGAV1_RESTRICT left, const int height, __m128i* pixels) { __m128i top = Load4(above); const __m128i bottom_left = _mm_set1_epi16(left[height - 1]); @@ -1390,7 +1417,8 @@ inline void LoadSmoothVerticalPixels4(const uint8_t* above, const uint8_t* left, // (256-w) counterparts. This is precomputed by the compiler when the weights // table is visible to this module. Removing this visibility can cut speed by up // to half in both 4xH and 8xH transforms. -inline void LoadSmoothVerticalWeights4(const uint8_t* weight_array, +inline void LoadSmoothVerticalWeights4(const uint8_t* LIBGAV1_RESTRICT + weight_array, const int height, __m128i* weights) { const __m128i inverter = _mm_set1_epi16(256); @@ -1413,7 +1441,8 @@ inline void LoadSmoothVerticalWeights4(const uint8_t* weight_array, } inline void WriteSmoothVertical4xH(const __m128i* pixel, const __m128i* weight, - const int height, uint8_t* dst, + const int height, + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride) { const __m128i pred_round = _mm_set1_epi32(128); const __m128i mask_increment = _mm_set1_epi16(0x0202); @@ -1438,9 +1467,10 @@ inline void WriteSmoothVertical4xH(const __m128i* pixel, const __m128i* weight, } } -void SmoothVertical4x4_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left = static_cast<const uint8_t*>(left_column); const auto* const above = static_cast<const uint8_t*>(top_row); auto* dst = static_cast<uint8_t*>(dest); @@ -1453,9 +1483,10 @@ void SmoothVertical4x4_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothVertical4xH(&pixels, weights, 4, dst, stride); } -void SmoothVertical4x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left = static_cast<const uint8_t*>(left_column); const auto* const above = static_cast<const uint8_t*>(top_row); auto* dst = static_cast<uint8_t*>(dest); @@ -1468,9 +1499,10 @@ void SmoothVertical4x8_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride); } -void SmoothVertical4x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left = static_cast<const uint8_t*>(left_column); const auto* const above = static_cast<const uint8_t*>(top_row); auto* dst = static_cast<uint8_t*>(dest); @@ -1485,9 +1517,10 @@ void SmoothVertical4x16_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothVertical4xH(&pixels, &weights[2], 8, dst, stride); } -void SmoothVertical8x4_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]); const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights)); @@ -1520,9 +1553,10 @@ void SmoothVertical8x4_SSE4_1(void* const dest, const ptrdiff_t stride, WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale); } -void SmoothVertical8x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]); const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4)); @@ -1544,9 +1578,10 @@ void SmoothVertical8x8_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical8x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]); const __m128i weights = LoadUnaligned16(kSmoothWeights + 12); @@ -1583,9 +1618,10 @@ void SmoothVertical8x16_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical8x32_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const __m128i zero = _mm_setzero_si128(); const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]); @@ -1649,9 +1685,10 @@ void SmoothVertical8x32_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical16x4_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]); @@ -1694,9 +1731,10 @@ void SmoothVertical16x4_SSE4_1(void* const dest, const ptrdiff_t stride, scale); } -void SmoothVertical16x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical16x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]); @@ -1722,9 +1760,10 @@ void SmoothVertical16x8_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical16x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical16x16_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]); @@ -1766,9 +1805,10 @@ void SmoothVertical16x16_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical16x32_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical16x32_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]); @@ -1839,9 +1879,10 @@ void SmoothVertical16x32_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical16x64_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical16x64_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]); @@ -1887,9 +1928,10 @@ void SmoothVertical16x64_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical32x8_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical32x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); auto* dst = static_cast<uint8_t*>(dest); @@ -1922,9 +1964,10 @@ void SmoothVertical32x8_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical32x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical32x16_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); auto* dst = static_cast<uint8_t*>(dest); @@ -1975,9 +2018,10 @@ void SmoothVertical32x16_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical32x32_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical32x32_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); @@ -2063,9 +2107,10 @@ void SmoothVertical32x32_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical32x64_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical32x64_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); @@ -2120,9 +2165,10 @@ void SmoothVertical32x64_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical64x16_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical64x16_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); @@ -2192,9 +2238,10 @@ void SmoothVertical64x16_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical64x32_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical64x32_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); @@ -2311,9 +2358,10 @@ void SmoothVertical64x32_SSE4_1(void* const dest, const ptrdiff_t stride, } } -void SmoothVertical64x64_SSE4_1(void* const dest, const ptrdiff_t stride, - const void* const top_row, - const void* const left_column) { +void SmoothVertical64x64_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const left_ptr = static_cast<const uint8_t*>(left_column); auto* dst = static_cast<uint8_t*>(dest); const auto* const top_ptr = static_cast<const uint8_t*>(top_row); |