Diffstat (limited to 'src/dsp/x86/intrapred_sse4.cc')
-rw-r--r--  src/dsp/x86/intrapred_sse4.cc  1355
1 file changed, 3 insertions, 1352 deletions
diff --git a/src/dsp/x86/intrapred_sse4.cc b/src/dsp/x86/intrapred_sse4.cc
index 9938dfe..063929d 100644
--- a/src/dsp/x86/intrapred_sse4.cc
+++ b/src/dsp/x86/intrapred_sse4.cc
@@ -23,13 +23,14 @@
#include <cassert>
#include <cstddef>
#include <cstdint>
-#include <cstring> // memcpy
+#include <cstring>
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/dsp/x86/transpose_sse4.h"
#include "src/utils/common.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
@@ -51,10 +52,6 @@ inline __m128i DivideByMultiplyShift_U32(const __m128i dividend) {
return _mm_mulhi_epi16(interm, _mm_cvtsi32_si128(multiplier));
}
-// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
-// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
-constexpr int kDuplicateFirstHalf = 0x44;
-
//------------------------------------------------------------------------------
// DcPredFuncs_SSE4_1
@@ -1408,1337 +1405,6 @@ void Paeth64x64_SSE4_1(void* const dest, ptrdiff_t stride,
WritePaeth16x16(dst + 48, stride, top_left, top_3, left_3);
}
-//------------------------------------------------------------------------------
-// 7.11.2.4. Directional intra prediction process
-
-// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
-// upsampling is ruled out. In addition, the bits masked by 0x3F for
-// |shift_val| are 0 for all multiples of 64, so the formula
-// val = top[top_base_x]*shift + top[top_base_x+1]*(32-shift), reduces to
-// val = top[top_base_x+1] << 5, meaning only the second set of pixels is
-// involved in the output. Hence |top| is offset by 1.
-inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride,
- const uint8_t* const top, const int width,
- const int height) {
- ptrdiff_t offset = 1;
- if (height == 4) {
- memcpy(dst, top + offset, width);
- dst += stride;
- memcpy(dst, top + offset + 1, width);
- dst += stride;
- memcpy(dst, top + offset + 2, width);
- dst += stride;
- memcpy(dst, top + offset + 3, width);
- return;
- }
- int y = 0;
- do {
- memcpy(dst, top + offset, width);
- dst += stride;
- memcpy(dst, top + offset + 1, width);
- dst += stride;
- memcpy(dst, top + offset + 2, width);
- dst += stride;
- memcpy(dst, top + offset + 3, width);
- dst += stride;
- memcpy(dst, top + offset + 4, width);
- dst += stride;
- memcpy(dst, top + offset + 5, width);
- dst += stride;
- memcpy(dst, top + offset + 6, width);
- dst += stride;
- memcpy(dst, top + offset + 7, width);
- dst += stride;
-
- offset += 8;
- y += 8;
- } while (y < height);
-}
-
-inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride,
- const uint8_t* const top, const int height,
- const int xstep, const bool upsampled) {
- const int upsample_shift = static_cast<int>(upsampled);
- const int scale_bits = 6 - upsample_shift;
- const int rounding_bits = 5;
- const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
- const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
- const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100)
- : _mm_set_epi64x(0, 0x0403030202010100);
- // Each 16-bit value here corresponds to a position that may exceed
- // |max_base_x|. When added to the top_base_x, it is used to mask values
- // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
- // not supported for packed integers.
- const __m128i offsets =
- _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
-
- // All rows from |min_corner_only_y| down will simply use memcpy. |max_base_x|
- // is always greater than |height|, so clipping to 1 is enough to make the
- // logic work.
- const int xstep_units = std::max(xstep >> scale_bits, 1);
- const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
-
- // Rows up to this y-value can be computed without checking for bounds.
- int y = 0;
- int top_x = xstep;
-
- for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
- const int top_base_x = top_x >> scale_bits;
-
- // Permit negative values of |top_x|.
- const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i max_shift = _mm_set1_epi8(32);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- __m128i top_index_vect = _mm_set1_epi16(top_base_x);
- top_index_vect = _mm_add_epi16(top_index_vect, offsets);
- const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
-
- // Load 8 values because we will select the sampled values based on
- // |upsampled|.
- const __m128i values = LoadLo8(top + top_base_x);
- const __m128i sampled_values = _mm_shuffle_epi8(values, sampler);
- const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
- __m128i prod = _mm_maddubs_epi16(sampled_values, shifts);
- prod = RightShiftWithRounding_U16(prod, rounding_bits);
- // Replace pixels from invalid range with top-right corner.
- prod = _mm_blendv_epi8(prod, final_top_val, past_max);
- Store4(dst, _mm_packus_epi16(prod, prod));
- }
-
- // Fill in corner-only rows.
- for (; y < height; ++y) {
- memset(dst, top[max_base_x], /* width */ 4);
- dst += stride;
- }
-}
-
-// 7.11.2.4 (7) angle < 90
-inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride,
- const uint8_t* const top_row,
- const int width, const int height,
- const int xstep, const bool upsampled) {
- const int upsample_shift = static_cast<int>(upsampled);
- const __m128i sampler =
- upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
- : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
- const int scale_bits = 6 - upsample_shift;
- const int max_base_x = ((width + height) - 1) << upsample_shift;
-
- const __m128i max_shift = _mm_set1_epi8(32);
- const int rounding_bits = 5;
- const int base_step = 1 << upsample_shift;
- const int base_step8 = base_step << 3;
-
- // All rows from |min_corner_only_y| down will simply use memcpy. |max_base_x|
- // is always greater than |height|, so clipping to 1 is enough to make the
- // logic work.
- const int xstep_units = std::max(xstep >> scale_bits, 1);
- const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
-
- // Rows up to this y-value can be computed without checking for bounds.
- const int max_no_corner_y = std::min(
- LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep,
- height);
- // No need to check for exceeding |max_base_x| in the first loop.
- int y = 0;
- int top_x = xstep;
- for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
- int top_base_x = top_x >> scale_bits;
- // Permit negative values of |top_x|.
- const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- int x = 0;
- do {
- const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
- __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
- vals = _mm_maddubs_epi16(vals, shifts);
- vals = RightShiftWithRounding_U16(vals, rounding_bits);
- StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
- top_base_x += base_step8;
- x += 8;
- } while (x < width);
- }
-
- // Each 16-bit value here corresponds to a position that may exceed
- // |max_base_x|. When added to the top_base_x, it is used to mask values
- // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
- // not supported for packed integers.
- const __m128i offsets =
- _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
-
- const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
- const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
- const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
- for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
- int top_base_x = top_x >> scale_bits;
-
- const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- __m128i top_index_vect = _mm_set1_epi16(top_base_x);
- top_index_vect = _mm_add_epi16(top_index_vect, offsets);
-
- int x = 0;
- const int min_corner_only_x =
- std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
- for (; x < min_corner_only_x;
- x += 8, top_base_x += base_step8,
- top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
- const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
- // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
- // reading out of bounds. If all indices are past max and we don't need to
- // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
- // reset for the next |y|.
- top_base_x &= ~_mm_cvtsi128_si32(past_max);
- const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
- __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
- vals = _mm_maddubs_epi16(vals, shifts);
- vals = RightShiftWithRounding_U16(vals, rounding_bits);
- vals = _mm_blendv_epi8(vals, final_top_val, past_max);
- StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
- }
- // Corner-only section of the row.
- memset(dest + x, top_row[max_base_x], width - x);
- }
- // Fill in corner-only rows.
- for (; y < height; ++y) {
- memset(dest, top_row[max_base_x], width);
- dest += stride;
- }
-}
-
-// 7.11.2.4 (7) angle < 90
-inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride,
- const uint8_t* const top_row,
- const int width, const int height,
- const int xstep, const bool upsampled) {
- const int upsample_shift = static_cast<int>(upsampled);
- if (xstep == 64) {
- DirectionalZone1_Step64(dest, stride, top_row, width, height);
- return;
- }
- if (width == 4) {
- DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled);
- return;
- }
- if (width >= 32) {
- DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
- upsampled);
- return;
- }
- const __m128i sampler =
- upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
- : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
- const int scale_bits = 6 - upsample_shift;
- const int max_base_x = ((width + height) - 1) << upsample_shift;
-
- const __m128i max_shift = _mm_set1_epi8(32);
- const int rounding_bits = 5;
- const int base_step = 1 << upsample_shift;
- const int base_step8 = base_step << 3;
-
- // No need to check for exceeding |max_base_x| in the loops.
- if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) {
- int top_x = xstep;
- int y = 0;
- do {
- int top_base_x = top_x >> scale_bits;
- // Permit negative values of |top_x|.
- const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- int x = 0;
- do {
- const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
- __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
- vals = _mm_maddubs_epi16(vals, shifts);
- vals = RightShiftWithRounding_U16(vals, rounding_bits);
- StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
- top_base_x += base_step8;
- x += 8;
- } while (x < width);
- dest += stride;
- top_x += xstep;
- } while (++y < height);
- return;
- }
-
- // Each 16-bit value here corresponds to a position that may exceed
- // |max_base_x|. When added to the top_base_x, it is used to mask values
- // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
- // not supported for packed integers.
- const __m128i offsets =
- _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
-
- const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
- const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
- const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
- int top_x = xstep;
- int y = 0;
- do {
- int top_base_x = top_x >> scale_bits;
-
- if (top_base_x >= max_base_x) {
- for (int i = y; i < height; ++i) {
- memset(dest, top_row[max_base_x], width);
- dest += stride;
- }
- return;
- }
-
- const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- __m128i top_index_vect = _mm_set1_epi16(top_base_x);
- top_index_vect = _mm_add_epi16(top_index_vect, offsets);
-
- int x = 0;
- for (; x < width - 8;
- x += 8, top_base_x += base_step8,
- top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
- const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
- // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
- // reading out of bounds. If all indices are past max and we don't need to
- // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
- // reset for the next |y|.
- top_base_x &= ~_mm_cvtsi128_si32(past_max);
- const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
- __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
- vals = _mm_maddubs_epi16(vals, shifts);
- vals = RightShiftWithRounding_U16(vals, rounding_bits);
- vals = _mm_blendv_epi8(vals, final_top_val, past_max);
- StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
- }
- const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
- __m128i vals;
- if (upsampled) {
- vals = LoadUnaligned16(top_row + top_base_x);
- } else {
- const __m128i top_vals = LoadLo8(top_row + top_base_x);
- vals = _mm_shuffle_epi8(top_vals, sampler);
- vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15);
- }
- vals = _mm_maddubs_epi16(vals, shifts);
- vals = RightShiftWithRounding_U16(vals, rounding_bits);
- vals = _mm_blendv_epi8(vals, final_top_val, past_max);
- StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
- dest += stride;
- top_x += xstep;
- } while (++y < height);
-}
-
-void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const int width, const int height,
- const int xstep,
- const bool upsampled_top) {
- const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
- auto* dst = static_cast<uint8_t*>(dest);
- DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep,
- upsampled_top);
-}
-
-template <bool upsampled>
-inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride,
- const uint8_t* const left_column,
- const int base_left_y, const int ystep) {
- // For use in the non-upsampled case.
- const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100);
- const int upsample_shift = static_cast<int>(upsampled);
- const int scale_bits = 6 - upsample_shift;
- const __m128i max_shift = _mm_set1_epi8(32);
- const int rounding_bits = 5;
-
- __m128i result_block[4];
- for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) {
- const int left_base_y = left_y >> scale_bits;
- const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- __m128i vals;
- if (upsampled) {
- vals = LoadLo8(left_column + left_base_y);
- } else {
- const __m128i top_vals = LoadLo8(left_column + left_base_y);
- vals = _mm_shuffle_epi8(top_vals, sampler);
- }
- vals = _mm_maddubs_epi16(vals, shifts);
- vals = RightShiftWithRounding_U16(vals, rounding_bits);
- result_block[x] = _mm_packus_epi16(vals, vals);
- }
- const __m128i result = Transpose4x4_U8(result_block);
- // This is result_row0.
- Store4(dest, result);
- dest += stride;
- const int result_row1 = _mm_extract_epi32(result, 1);
- memcpy(dest, &result_row1, sizeof(result_row1));
- dest += stride;
- const int result_row2 = _mm_extract_epi32(result, 2);
- memcpy(dest, &result_row2, sizeof(result_row2));
- dest += stride;
- const int result_row3 = _mm_extract_epi32(result, 3);
- memcpy(dest, &result_row3, sizeof(result_row3));
-}
-
-template <bool upsampled, int height>
-inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride,
- const uint8_t* const left_column,
- const int base_left_y, const int ystep) {
- // For use in the non-upsampled case.
- const __m128i sampler =
- _mm_set_epi64x(0x0807070606050504, 0x0403030202010100);
- const int upsample_shift = static_cast<int>(upsampled);
- const int scale_bits = 6 - upsample_shift;
- const __m128i max_shift = _mm_set1_epi8(32);
- const int rounding_bits = 5;
-
- __m128i result_block[8];
- for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) {
- const int left_base_y = left_y >> scale_bits;
- const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- __m128i vals;
- if (upsampled) {
- vals = LoadUnaligned16(left_column + left_base_y);
- } else {
- const __m128i top_vals = LoadUnaligned16(left_column + left_base_y);
- vals = _mm_shuffle_epi8(top_vals, sampler);
- }
- vals = _mm_maddubs_epi16(vals, shifts);
- result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits);
- }
- Transpose8x8_U16(result_block, result_block);
- for (int y = 0; y < height; ++y) {
- StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y]));
- dest += stride;
- }
-}
-
-// 7.11.2.4 (9) angle > 180
-void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride,
- const void* const left_column,
- const int width, const int height,
- const int ystep,
- const bool upsampled) {
- const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
- auto* dst = static_cast<uint8_t*>(dest);
- const int upsample_shift = static_cast<int>(upsampled);
- if (width == 4 || height == 4) {
- const ptrdiff_t stride4 = stride << 2;
- if (upsampled) {
- int left_y = ystep;
- int x = 0;
- do {
- uint8_t* dst_x = dst + x;
- int y = 0;
- do {
- DirectionalZone3_4x4<true>(
- dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
- dst_x += stride4;
- y += 4;
- } while (y < height);
- left_y += ystep << 2;
- x += 4;
- } while (x < width);
- } else {
- int left_y = ystep;
- int x = 0;
- do {
- uint8_t* dst_x = dst + x;
- int y = 0;
- do {
- DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y,
- ystep);
- dst_x += stride4;
- y += 4;
- } while (y < height);
- left_y += ystep << 2;
- x += 4;
- } while (x < width);
- }
- return;
- }
-
- const ptrdiff_t stride8 = stride << 3;
- if (upsampled) {
- int left_y = ystep;
- int x = 0;
- do {
- uint8_t* dst_x = dst + x;
- int y = 0;
- do {
- DirectionalZone3_8xH<true, 8>(
- dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
- dst_x += stride8;
- y += 8;
- } while (y < height);
- left_y += ystep << 3;
- x += 8;
- } while (x < width);
- } else {
- int left_y = ystep;
- int x = 0;
- do {
- uint8_t* dst_x = dst + x;
- int y = 0;
- do {
- DirectionalZone3_8xH<false, 8>(
- dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
- dst_x += stride8;
- y += 8;
- } while (y < height);
- left_y += ystep << 3;
- x += 8;
- } while (x < width);
- }
-}
-
-//------------------------------------------------------------------------------
-// Directional Zone 2 Functions
-// 7.11.2.4 (8)
-
-// DirectionalBlend* selectively overwrites the values written by
-// DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each
-// row.
-template <int y_selector>
-inline void DirectionalBlend4_SSE4_1(uint8_t* dest,
- const __m128i& dest_index_vect,
- const __m128i& vals,
- const __m128i& zone_bounds) {
- const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector);
- const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
- const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest));
- const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
- Store4(dest, _mm_packus_epi16(blended_vals, blended_vals));
-}
-
-inline void DirectionalBlend8_SSE4_1(uint8_t* dest,
- const __m128i& dest_index_vect,
- const __m128i& vals,
- const __m128i& zone_bounds,
- const __m128i& bounds_selector) {
- const __m128i max_dest_x_vect =
- _mm_shuffle_epi8(zone_bounds, bounds_selector);
- const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
- const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest));
- const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
- StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals));
-}
-
-constexpr int kDirectionalWeightBits = 5;
-// |source| is packed with 4 or 8 pairs of 8-bit values from left or top.
-// |shifts| is named to match the specification, with 4 or 8 pairs of (32 -
-// shift) and shift. Shift is guaranteed to be between 0 and 32.
-inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source,
- const __m128i& shifts,
- const __m128i& sampler) {
- const __m128i src_vals = LoadUnaligned16(source);
- __m128i vals = _mm_shuffle_epi8(src_vals, sampler);
- vals = _mm_maddubs_epi16(vals, shifts);
- return RightShiftWithRounding_U16(vals, kDirectionalWeightBits);
-}
-
-// Because the source values "move backwards" as the row index increases, the
-// indices derived from ystep are generally negative. This is accommodated by
-// making sure the relative indices are within [-15, 0] when the function is
-// called, and sliding them into the inclusive range [0, 15], relative to a
-// lower base address.
-constexpr int kPositiveIndexOffset = 15;
-
-template <bool upsampled>
-inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
- uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base,
- __m128i left_y) {
- const int upsample_shift = static_cast<int>(upsampled);
- const int scale_bits = 6 - upsample_shift;
- const __m128i max_shifts = _mm_set1_epi8(32);
- const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
- const __m128i index_increment = _mm_cvtsi32_si128(0x01010101);
- const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset);
- // Left_column and sampler are both offset by 15 so the indices are always
- // positive.
- const uint8_t* left_column = left_column_base - kPositiveIndexOffset;
- for (int y = 0; y < 4; dst += stride, ++y) {
- __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
- offset_y = _mm_packs_epi16(offset_y, offset_y);
-
- const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
- __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
- // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they
- // can work as shuffle indices. Some values may be out of bounds, but their
- // pred results will be masked over by top prediction.
- sampler = _mm_add_epi8(sampler, positive_offset);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
- const __m128i vals = DirectionalZone2FromSource_SSE4_1(
- left_column + (y << upsample_shift), shifts, sampler);
- Store4(dst, _mm_packus_epi16(vals, vals));
- }
-}
-
-// The height at which a load of 16 bytes will not contain enough source pixels
-// from |left_column| to supply an accurate row when computing 8 pixels at a
-// time. The values are found by inspection. By coincidence, all angles that
-// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
-// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
- 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
-
-template <bool upsampled>
-inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
- uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
- __m128i left_y) {
- const int upsample_shift = static_cast<int>(upsampled);
- const int scale_bits = 6 - upsample_shift;
- const __m128i max_shifts = _mm_set1_epi8(32);
- const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
- const __m128i index_increment = _mm_set1_epi8(1);
- const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset);
- for (int y = 0; y < 8; dst += stride, ++y) {
- __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
- offset_y = _mm_packs_epi16(offset_y, offset_y);
- const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
-
- // Offset the relative index because ystep is negative in Zone 2 and shuffle
- // indices must be nonnegative.
- __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
- sampler = _mm_add_epi8(sampler, denegation);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
-
- // The specification adds (y << 6) to left_y, which is subject to
- // upsampling, but this puts sampler indices out of the 0-15 range. It is
- // equivalent to offset the source address by (y << upsample_shift) instead.
- const __m128i vals = DirectionalZone2FromSource_SSE4_1(
- left_column - kPositiveIndexOffset + (y << upsample_shift), shifts,
- sampler);
- StoreLo8(dst, _mm_packus_epi16(vals, vals));
- }
-}
-
-// |zone_bounds| is an epi16 of the relative x index at which base >= -(1 <<
-// upsampled_top), for each row. When there are 4 values, they can be duplicated
-// with a non-register shuffle mask.
-// |shifts| is one pair of weights that applies throughout a given row.
-template <bool upsampled_top>
-inline void DirectionalZone1Blend_4x4(
- uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
- __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
- const __m128i& dest_index_x, int top_x, const int xstep) {
- const int upsample_shift = static_cast<int>(upsampled_top);
- const int scale_bits_x = 6 - upsample_shift;
- top_x -= xstep;
-
- int top_base_x = (top_x >> scale_bits_x);
- const __m128i vals0 = DirectionalZone2FromSource_SSE4_1(
- top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler);
- DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds);
- top_x -= xstep;
- dest += stride;
-
- top_base_x = (top_x >> scale_bits_x);
- const __m128i vals1 = DirectionalZone2FromSource_SSE4_1(
- top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler);
- DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds);
- top_x -= xstep;
- dest += stride;
-
- top_base_x = (top_x >> scale_bits_x);
- const __m128i vals2 = DirectionalZone2FromSource_SSE4_1(
- top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler);
- DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds);
- top_x -= xstep;
- dest += stride;
-
- top_base_x = (top_x >> scale_bits_x);
- const __m128i vals3 = DirectionalZone2FromSource_SSE4_1(
- top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler);
- DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds);
-}
-
-template <bool upsampled_top, int height>
-inline void DirectionalZone1Blend_8xH(
- uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
- __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
- const __m128i& dest_index_x, int top_x, const int xstep) {
- const int upsample_shift = static_cast<int>(upsampled_top);
- const int scale_bits_x = 6 - upsample_shift;
-
- __m128i y_selector = _mm_set1_epi32(0x01000100);
- const __m128i index_increment = _mm_set1_epi32(0x02020202);
- for (int y = 0; y < height; ++y,
- y_selector = _mm_add_epi8(y_selector, index_increment),
- dest += stride) {
- top_x -= xstep;
- const int top_base_x = top_x >> scale_bits_x;
- const __m128i vals = DirectionalZone2FromSource_SSE4_1(
- top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler);
- DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds, y_selector);
- }
-}
-
-// 7.11.2.4 (8) 90 < angle < 180
-// The strategy for this function is to know how many blocks can be processed
-// with just pixels from |top_ptr|, then handle mixed blocks, then handle only
-// blocks that take from |left_ptr|. Additionally, a fast index-shuffle
-// approach is used for pred values from |left_column| in sections that permit
-// it.
-template <bool upsampled_left, bool upsampled_top>
-inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
- const uint8_t* const top_row,
- const uint8_t* const left_column,
- const int width, const int height,
- const int xstep, const int ystep) {
- auto* dst = static_cast<uint8_t*>(dest);
- const int upsample_left_shift = static_cast<int>(upsampled_left);
- const int upsample_top_shift = static_cast<int>(upsampled_top);
- const __m128i max_shift = _mm_set1_epi8(32);
- const ptrdiff_t stride8 = stride << 3;
- const __m128i dest_index_x =
- _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
- const __m128i sampler_top =
- upsampled_top
- ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
- : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
- const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
- // All columns from |min_top_only_x| to the right will only need |top_row| to
- // compute. This assumes minimum |xstep| is 3.
- const int min_top_only_x = std::min((height * xstep) >> 6, width);
-
- // For steep angles, the source pixels from left_column may not fit in a
- // 16-byte load for shuffling.
- // TODO(petersonab): Find a more precise formula for this subject to x.
- const int max_shuffle_height =
- std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]);
-
- const int xstep8 = xstep << 3;
- const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
- // Accumulate xstep across 8 rows.
- const __m128i xstep_dup = _mm_set1_epi16(-xstep);
- const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
- const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
- // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
- const __m128i scaled_one = _mm_set1_epi16(-64);
- __m128i xstep_bounds_base =
- (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
- : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
-
- const int left_base_increment = ystep >> 6;
- const int ystep_remainder = ystep & 0x3F;
- const int ystep8 = ystep << 3;
- const int left_base_increment8 = ystep8 >> 6;
- const int ystep_remainder8 = ystep8 & 0x3F;
- const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8);
-
- // If the 64 scaling is regarded as a decimal point, the first value of the
- // left_y vector omits the portion which is covered under the left_column
- // offset. Following values need the full ystep as a relative offset.
- const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
- const __m128i ystep_dup = _mm_set1_epi16(-ystep);
- __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
- left_y = _mm_add_epi16(ystep_init, left_y);
-
- const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
- int x = 0;
-
- // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
- // The first stage, before the first y-loop, covers blocks that are only
- // computed from the top row. The second stage, comprising two y-loops, covers
- // blocks that have a mixture of values computed from top or left. The final
- // stage covers blocks that are only computed from the left.
- for (int left_offset = -left_base_increment; x < min_top_only_x;
- x += 8,
- xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
- // Watch left_y because it can still get big.
- left_y = _mm_add_epi16(left_y, increment_left8),
- left_offset -= left_base_increment8) {
- uint8_t* dst_x = dst + x;
-
- // Round down to the nearest multiple of 8.
- const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
- DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
- max_top_only_y, -xstep, upsampled_top);
- DirectionalZone1_4xH(dst_x + 4, stride,
- top_row + ((x + 4) << upsample_top_shift),
- max_top_only_y, -xstep, upsampled_top);
-
- int y = max_top_only_y;
- dst_x += stride * y;
- const int xstep_y = xstep * y;
- const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
- // All rows from |min_left_only_y| down for this set of columns, only need
- // |left_column| to compute.
- const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
- // At high angles such that min_left_only_y < 8, ystep is low and xstep is
- // high. This means that max_shuffle_height is unbounded and xstep_bounds
- // will overflow in 16 bits. This is prevented by stopping the first
- // blending loop at min_left_only_y for such cases, which means we skip over
- // the second blending loop as well.
- const int left_shuffle_stop_y =
- std::min(max_shuffle_height, min_left_only_y);
- __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
- __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
- int top_x = -xstep_y;
-
- for (; y < left_shuffle_stop_y;
- y += 8, dst_x += stride8,
- xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
- xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
- top_x -= xstep8) {
- DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), left_y);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
- shift_mask),
- 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
- __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
- DirectionalZone1Blend_8xH<upsampled_top, 8>(
- dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
- xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
- }
- // Pick up from the last y-value, using the 10% slower but secure method for
- // left prediction.
- const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
- for (; y < min_left_only_y;
- y += 8, dst_x += stride8,
- xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
- xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
- top_x -= xstep8) {
- const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
-
- DirectionalZone3_8xH<upsampled_left, 8>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
- -ystep);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
- shift_mask),
- 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
- DirectionalZone1Blend_8xH<upsampled_top, 8>(
- dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
- xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
- }
- // Loop over y for left_only rows.
- for (; y < height; y += 8, dst_x += stride8) {
- DirectionalZone3_8xH<upsampled_left, 8>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
- -ystep);
- }
- }
- for (; x < width; x += 4) {
- DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
- height, -xstep, upsampled_top);
- }
-}
-
-template <bool upsampled_left, bool upsampled_top>
-inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
- const uint8_t* const top_row,
- const uint8_t* const left_column,
- const int width, const int height,
- const int xstep, const int ystep) {
- auto* dst = static_cast<uint8_t*>(dest);
- const int upsample_left_shift = static_cast<int>(upsampled_left);
- const int upsample_top_shift = static_cast<int>(upsampled_top);
- const __m128i max_shift = _mm_set1_epi8(32);
- const ptrdiff_t stride4 = stride << 2;
- const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000);
- const __m128i sampler_top =
- upsampled_top
- ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
- : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
- // All columns from |min_top_only_x| to the right will only need |top_row| to
- // compute.
- assert(xstep >= 3);
- const int min_top_only_x = std::min((height * xstep) >> 6, width);
-
- const int xstep4 = xstep << 2;
- const __m128i xstep4_vect = _mm_set1_epi16(xstep4);
- const __m128i xstep_dup = _mm_set1_epi16(-xstep);
- const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001);
- __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
- const __m128i scaled_one = _mm_set1_epi16(-64);
- // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
- __m128i xstep_bounds_base =
- (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
- : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
-
- const int left_base_increment = ystep >> 6;
- const int ystep_remainder = ystep & 0x3F;
- const int ystep4 = ystep << 2;
- const int left_base_increment4 = ystep4 >> 6;
- // This is guaranteed to be less than 64, but accumulation may bring it past
- // 64 for higher x values.
- const int ystep_remainder4 = ystep4 & 0x3F;
- const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4);
- const __m128i increment_top4 = _mm_set1_epi16(4 << 6);
-
- // If the 64 scaling is regarded as a decimal point, the first value of the
- // left_y vector omits the portion which will go into the left_column offset.
- // Following values need the full ystep as a relative offset.
- const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
- const __m128i ystep_dup = _mm_set1_epi16(-ystep);
- __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
- left_y = _mm_add_epi16(ystep_init, left_y);
- const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
-
- int x = 0;
- // Loop over x for columns with a mixture of sources.
- for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4,
- xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4),
- left_y = _mm_add_epi16(left_y, increment_left4),
- left_offset -= left_base_increment4) {
- uint8_t* dst_x = dst + x;
-
- // Round down to the nearest multiple of 8.
- const int max_top_only_y = std::min((x << 6) / xstep, height) & 0xFFFFFFF4;
- DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
- max_top_only_y, -xstep, upsampled_top);
- int y = max_top_only_y;
- dst_x += stride * y;
- const int xstep_y = xstep * y;
- const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
- // All rows from |min_left_only_y| down for this set of columns, only need
- // |left_column| to compute. Rounded up to the nearest multiple of 4.
- const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height);
-
- __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
- __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
- int top_x = -xstep_y;
-
- // Loop over y for mixed rows.
- for (; y < min_left_only_y;
- y += 4, dst_x += stride4,
- xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect),
- xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect),
- top_x -= xstep4) {
- DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
- dst_x, stride,
- left_column + ((left_offset + y) * (1 << upsample_left_shift)),
- left_y);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
- shift_mask),
- 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
- const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
- DirectionalZone1Blend_4x4<upsampled_top>(
- dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
- xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
- }
- // Loop over y for left-only rows, if any.
- for (; y < height; y += 4, dst_x += stride4) {
- DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), left_y);
- }
- }
- // Loop over top-only columns, if any.
- for (; x < width; x += 4) {
- DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
- height, -xstep, upsampled_top);
- }
-}
-
-void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column,
- const int width, const int height,
- const int xstep, const int ystep,
- const bool upsampled_top,
- const bool upsampled_left) {
- // Increasing the negative buffer for this function allows more rows to be
- // processed at a time without branching in an inner loop to check the base.
- uint8_t top_buffer[288];
- uint8_t left_buffer[288];
- memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
- memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
- const uint8_t* top_ptr = top_buffer + 144;
- const uint8_t* left_ptr = left_buffer + 144;
- if (width == 4 || height == 4) {
- if (upsampled_left) {
- if (upsampled_top) {
- DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- } else {
- DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- }
- } else {
- if (upsampled_top) {
- DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- } else {
- DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- }
- }
- return;
- }
- if (upsampled_left) {
- if (upsampled_top) {
- DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- } else {
- DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- }
- } else {
- if (upsampled_top) {
- DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- } else {
- DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- }
- }
-}
-
-//------------------------------------------------------------------------------
-// FilterIntraPredictor_SSE4_1
-
-// Apply all filter taps to the given 7 packed 16-bit values, keeping the 8th
-// at zero to preserve the sum.
-inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride,
- const __m128i& pixels, const __m128i& taps_0_1,
- const __m128i& taps_2_3, const __m128i& taps_4_5,
- const __m128i& taps_6_7) {
- const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1);
- const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3);
- // |output_half| contains 8 partial sums.
- __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
- __m128i output = _mm_hadd_epi16(output_half, output_half);
- const __m128i output_row0 =
- _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
- /* arbitrary pack arg */ output);
- Store4(dst, output_row0);
- const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5);
- const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7);
- output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
- output = _mm_hadd_epi16(output_half, output_half);
- const __m128i output_row1 =
- _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
- /* arbitrary pack arg */ output);
- Store4(dst + stride, output_row1);
-}
-
-// 4xH transform sizes are given special treatment because LoadLo8 goes out
-// of bounds and every block involves the left column. This implementation
-// loads TL from the top row for the first block, so it is not
-inline void Filter4xH(uint8_t* dest, ptrdiff_t stride,
- const uint8_t* const top_ptr,
- const uint8_t* const left_ptr, FilterIntraPredictor pred,
- const int height) {
- const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]);
- const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]);
- const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]);
- const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]);
- __m128i top = Load4(top_ptr - 1);
- __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4);
- __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr));
- left = _mm_slli_si128(left, 5);
-
- // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
- // left[2], left[3], left[4], left[5], left[6], left[7]
- pixels = _mm_or_si128(left, pixels);
-
- // Duplicate first 8 bytes.
- pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- dest += stride; // Move to y = 1.
- pixels = Load4(dest);
-
- // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
- // left[0], left[1], ...
- pixels = _mm_or_si128(left, pixels);
-
- // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
- // byte is an unused value, which shall be multiplied by 0 when we apply the
- // filter.
- constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
-
- // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
- const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
- pixels = _mm_shuffle_epi8(pixels, pixel_order1);
- dest += stride; // Move to y = 2.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- dest += stride; // Move to y = 3.
-
- // Compute the middle 8 rows before using common code for the final 4 rows.
- // Because the common code below this block assumes that
- if (height == 16) {
- // This shift allows us to use pixel_order2 twice after shifting by 2 later.
- left = _mm_slli_si128(left, 1);
- pixels = Load4(dest);
-
- // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
- // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
- pixels = _mm_or_si128(left, pixels);
-
- // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 7, 8, 15. The
- // last byte is an unused value, as above. The top-left was shifted to
- // position nine to keep two empty spaces after the top pixels.
- constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
-
- // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
- // the end.
- const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
- pixels = _mm_shuffle_epi8(pixels, pixel_order2);
- dest += stride; // Move to y = 4.
-
- // First 4x2 in the if body.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
-
- // Clear all but final pixel in the first 8 of left column.
- __m128i keep_top_left = _mm_srli_si128(left, 13);
- dest += stride; // Move to y = 5.
- pixels = Load4(dest);
- left = _mm_srli_si128(left, 2);
-
- // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
- // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
- pixels = _mm_or_si128(left, pixels);
- left = LoadLo8(left_ptr + 8);
-
- pixels = _mm_shuffle_epi8(pixels, pixel_order2);
- dest += stride; // Move to y = 6.
-
- // Second 4x2 in the if body.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
-
- // Position TL value so we can use pixel_order1.
- keep_top_left = _mm_slli_si128(keep_top_left, 6);
- dest += stride; // Move to y = 7.
- pixels = Load4(dest);
- left = _mm_slli_si128(left, 7);
- left = _mm_or_si128(left, keep_top_left);
-
- // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
- // left[-1], left[0], left[1], left[2], left[3], ...
- pixels = _mm_or_si128(left, pixels);
- pixels = _mm_shuffle_epi8(pixels, pixel_order1);
- dest += stride; // Move to y = 8.
-
- // Third 4x2 in the if body.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- dest += stride; // Move to y = 9.
-
- // Prepare final inputs.
- pixels = Load4(dest);
- left = _mm_srli_si128(left, 2);
-
- // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
- // left[-1], left[0], left[1], left[2], left[3], ...
- pixels = _mm_or_si128(left, pixels);
- pixels = _mm_shuffle_epi8(pixels, pixel_order1);
- dest += stride; // Move to y = 10.
-
- // Fourth 4x2 in the if body.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- dest += stride; // Move to y = 11.
- }
-
- // In both the 8 and 16 case, we assume that the left vector has the next TL
- // at position 8.
- if (height > 4) {
- // Erase prior left pixels by shifting TL to position 0.
- left = _mm_srli_si128(left, 8);
- left = _mm_slli_si128(left, 6);
- pixels = Load4(dest);
-
- // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
- // left[-1], left[0], left[1], left[2], left[3], ...
- pixels = _mm_or_si128(left, pixels);
- pixels = _mm_shuffle_epi8(pixels, pixel_order1);
- dest += stride; // Move to y = 12 or 4.
-
- // First of final two 4x2 blocks.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- dest += stride; // Move to y = 13 or 5.
- pixels = Load4(dest);
- left = _mm_srli_si128(left, 2);
-
- // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
- // left[-1], left[0], left[1], left[2], left[3], ...
- pixels = _mm_or_si128(left, pixels);
- pixels = _mm_shuffle_epi8(pixels, pixel_order1);
- dest += stride; // Move to y = 14 or 6.
-
- // Last of final two 4x2 blocks.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- }
-}
-
-void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column,
- FilterIntraPredictor pred, const int width,
- const int height) {
- const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
- const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
- auto* dst = static_cast<uint8_t*>(dest);
- if (width == 4) {
- Filter4xH(dst, stride, top_ptr, left_ptr, pred, height);
- return;
- }
-
- // There is one set of 7 taps for each of the 4x2 output pixels.
- const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]);
- const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]);
- const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]);
- const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]);
-
- // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
- // the end is an unused value, which shall be multiplied by 0 when we apply
- // the filter.
- constexpr int64_t kCondenseLeftMask = 0x0F09080403020100;
-
- // Takes the "left section" and puts it right after p0-p4.
- const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
-
- // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
- // byte is unused as above.
- constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008;
-
- // Shuffles the "top left" from the left section, to the front. Used when
- // grabbing data from left_column and not top_row.
- const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
-
- // This first pass takes care of the cases where the top left pixel comes from
- // top_row.
- __m128i pixels = LoadLo8(top_ptr - 1);
- __m128i left = _mm_slli_si128(Load4(left_column), 8);
- pixels = _mm_or_si128(pixels, left);
-
- // Two sets of the same pixels to multiply with two sets of taps.
- pixels = _mm_shuffle_epi8(pixels, pixel_order1);
- Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7);
- left = _mm_srli_si128(left, 1);
-
- // Load
- pixels = Load4(dst + stride);
-
- // Because of the above shift, this OR 'invades' the final of the first 8
- // bytes of |pixels|. This is acceptable because the 8th filter tap is always
- // a padded 0.
- pixels = _mm_or_si128(pixels, left);
- pixels = _mm_shuffle_epi8(pixels, pixel_order2);
- const ptrdiff_t stride2 = stride << 1;
- const ptrdiff_t stride4 = stride << 2;
- Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- dst += 4;
- for (int x = 3; x < width - 4; x += 4) {
- pixels = Load4(top_ptr + x);
- pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4);
- pixels = _mm_insert_epi8(pixels, dst[-1], 5);
- pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
-
- // Duplicate bottom half into upper half.
- pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
- Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- pixels = Load4(dst + stride - 1);
- pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
- pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
- pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6);
-
- // Duplicate bottom half into upper half.
- pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
- Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
- taps_4_5, taps_6_7);
- dst += 4;
- }
-
- // Now we handle heights that reference previous blocks rather than top_row.
- for (int y = 4; y < height; y += 4) {
- // Leftmost 4x4 block for this height.
- dst -= width;
- dst += stride4;
-
- // Top Left is not available by offset in these leftmost blocks.
- pixels = Load4(dst - stride);
- left = _mm_slli_si128(Load4(left_ptr + y - 1), 8);
- left = _mm_insert_epi8(left, left_ptr[y + 3], 12);
- pixels = _mm_or_si128(pixels, left);
- pixels = _mm_shuffle_epi8(pixels, pixel_order2);
- Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
-
- // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
- left = _mm_srli_si128(left, 2);
- pixels = Load4(dst + stride);
- pixels = _mm_or_si128(pixels, left);
- pixels = _mm_shuffle_epi8(pixels, pixel_order2);
- Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
- taps_4_5, taps_6_7);
-
- dst += 4;
-
- // Remaining 4x4 blocks for this height.
- for (int x = 4; x < width; x += 4) {
- pixels = Load4(dst - stride - 1);
- pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4);
- pixels = _mm_insert_epi8(pixels, dst[-1], 5);
- pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
-
- // Duplicate bottom half into upper half.
- pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
- Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- pixels = Load4(dst + stride - 1);
- pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
- pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
- pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6);
-
- // Duplicate bottom half into upper half.
- pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
- Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
- taps_4_5, taps_6_7);
- dst += 4;
- }
- }
-}
-
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
@@ -2746,21 +1412,6 @@ void Init8bpp() {
// These guards check if this version of the function was not superseded by
// a higher optimization level, such as AVX. The corresponding #define also
// prevents the C version from being added to the table.
-#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor)
- dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1;
-#endif
-#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1)
- dsp->directional_intra_predictor_zone1 =
- DirectionalIntraPredictorZone1_SSE4_1;
-#endif
-#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2)
- dsp->directional_intra_predictor_zone2 =
- DirectionalIntraPredictorZone2_SSE4_1;
-#endif
-#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3)
- dsp->directional_intra_predictor_zone3 =
- DirectionalIntraPredictorZone3_SSE4_1;
-#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
DcDefs::_4x4::DcTop;
@@ -3524,7 +2175,7 @@ void IntraPredInit_SSE4_1() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {