about | summary | refs | log | tree | commit | diff
path: root/src/dsp/x86/intrapred_cfl_sse4.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/x86/intrapred_cfl_sse4.cc')
-rw-r--r--  src/dsp/x86/intrapred_cfl_sse4.cc | 1057
1 files changed, 960 insertions, 97 deletions
diff --git a/src/dsp/x86/intrapred_cfl_sse4.cc b/src/dsp/x86/intrapred_cfl_sse4.cc
index fac1556..f2dcfdb 100644
--- a/src/dsp/x86/intrapred_cfl_sse4.cc
+++ b/src/dsp/x86/intrapred_cfl_sse4.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_cfl.h"
#include "src/utils/cpu.h"
#if LIBGAV1_TARGETING_SSE4_1
@@ -29,9 +29,48 @@
#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"
#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
+namespace {
+
+// This duplicates the last two 16-bit values in |row|.
+inline __m128i LastRowSamples(const __m128i row) {
+ return _mm_shuffle_epi32(row, 0xFF);
+}
+
+// This duplicates the last 16-bit value in |row|.
+inline __m128i LastRowResult(const __m128i row) {
+ const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF);
+ return _mm_shuffle_epi32(dup_row, 0xFF);
+}
+
+// Takes in two sums of input row pairs, and completes the computation for two
+// output rows.
+inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0,
+ const __m128i vertical_sum1,
+ int16_t* luma_ptr) {
+ __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
+ result = _mm_slli_epi16(result, 1);
+ StoreLo8(luma_ptr, result);
+ StoreHi8(luma_ptr + kCflLumaBufferStride, result);
+ return result;
+}
+
+// Takes two halves of a vertically added pair of rows and completes the
+// computation for one output row.
+inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0,
+ const __m128i vertical_sum1,
+ int16_t* luma_ptr) {
+ __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
+ result = _mm_slli_epi16(result, 1);
+ StoreUnaligned16(luma_ptr, result);
+ return result;
+}
+
+} // namespace
+
namespace low_bitdepth {
namespace {
@@ -40,8 +79,8 @@ namespace {
inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
__m128i alpha_sign, __m128i dc_q0) {
- __m128i ac_q3 = LoadUnaligned16(input);
- __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+ const __m128i ac_q3 = LoadUnaligned16(input);
+ const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
__m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
return _mm_add_epi16(scaled_luma_q0, dc_q0);
@@ -88,8 +127,7 @@ void CflIntraPredictor_SSE4_1(
template <int block_height_log2, bool is_inside>
void CflSubsampler444_4xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
- const int /*max_luma_width*/, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
static_assert(block_height_log2 <= 4, "");
const int block_height = 1 << block_height_log2;
const int visible_height = max_luma_height;
@@ -119,12 +157,15 @@ void CflSubsampler444_4xH_SSE4_1(
} while (y < visible_height);
if (!is_inside) {
- int y = visible_height;
+ // Replicate the 2 high lanes.
+ samples = _mm_shuffle_epi32(samples, 0xee);
do {
+ StoreLo8(luma_ptr, samples);
+ luma_ptr += kCflLumaBufferStride;
StoreHi8(luma_ptr, samples);
luma_ptr += kCflLumaBufferStride;
sum = _mm_add_epi16(sum, samples);
- ++y;
+ y += 2;
} while (y < block_height);
}
@@ -152,15 +193,15 @@ void CflSubsampler444_4xH_SSE4_1(
static_assert(block_height_log2 <= 4, "");
assert(max_luma_width >= 4);
assert(max_luma_height >= 4);
- const int block_height = 1 << block_height_log2;
- const int block_width = 4;
+ static_cast<void>(max_luma_width);
+ constexpr int block_height = 1 << block_height_log2;
- if (block_height <= max_luma_height && block_width <= max_luma_width) {
- CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(
- luma, max_luma_width, max_luma_height, source, stride);
+ if (block_height <= max_luma_height) {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
} else {
- CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(
- luma, max_luma_width, max_luma_height, source, stride);
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
}
}
@@ -302,19 +343,9 @@ void CflSubsampler444_SSE4_1(
__m128i inner_sum_lo, inner_sum_hi;
int y = 0;
do {
-#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are
- // then masked off by blendv, MSAN isn't smart enough to
- // understand that. So we switch to a C implementation here.
- uint16_t c_arr[16];
- for (int x = 0; x < 16; x++) {
- const int x_index = std::min(x, visible_width_16 - 1);
- c_arr[x] = src[x_index] << 3;
- }
- samples0 = LoadUnaligned16(c_arr);
- samples1 = LoadUnaligned16(c_arr + 8);
- static_cast<void>(blend_mask_16);
-#else
- __m128i samples01 = LoadUnaligned16(src);
+ // We can load uninitialized values here. Even though they are then masked
+ // off by blendv, MSAN doesn't model that behavior.
+ __m128i samples01 = LoadUnaligned16Msan(src, invisible_width_16);
if (!inside) {
const __m128i border16 =
@@ -323,26 +354,15 @@ void CflSubsampler444_SSE4_1(
}
samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples01), 3);
samples1 = _mm_slli_epi16(_mm_unpackhi_epi8(samples01, zero), 3);
-#endif // LIBGAV1_MSAN
StoreUnaligned16(luma_ptr, samples0);
StoreUnaligned16(luma_ptr + 8, samples1);
__m128i inner_sum = _mm_add_epi16(samples0, samples1);
if (block_width == 32) {
-#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are
- // then masked off by blendv, MSAN isn't smart enough to
- // understand that. So we switch to a C implementation here.
- uint16_t c_arr[16];
- for (int x = 16; x < 32; x++) {
- const int x_index = std::min(x, visible_width_32 - 1);
- c_arr[x - 16] = src[x_index] << 3;
- }
- samples2 = LoadUnaligned16(c_arr);
- samples3 = LoadUnaligned16(c_arr + 8);
- static_cast<void>(blend_mask_32);
-#else
- __m128i samples23 = LoadUnaligned16(src + 16);
+ // We can load uninitialized values here. Even though they are then masked
+ // off by blendv, MSAN doesn't model that behavior.
+ __m128i samples23 = LoadUnaligned16Msan(src + 16, invisible_width_32);
if (!inside) {
const __m128i border32 =
_mm_set1_epi8(static_cast<int8_t>(src[visible_width_32 - 1]));
@@ -350,7 +370,6 @@ void CflSubsampler444_SSE4_1(
}
samples2 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples23), 3);
samples3 = _mm_slli_epi16(_mm_unpackhi_epi8(samples23, zero), 3);
-#endif // LIBGAV1_MSAN
StoreUnaligned16(luma_ptr + 16, samples2);
StoreUnaligned16(luma_ptr + 24, samples3);
@@ -418,29 +437,6 @@ void CflSubsampler444_SSE4_1(
}
}
-// Takes in two sums of input row pairs, and completes the computation for two
-// output rows.
-inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0,
- const __m128i vertical_sum1,
- int16_t* luma_ptr) {
- __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
- result = _mm_slli_epi16(result, 1);
- StoreLo8(luma_ptr, result);
- StoreHi8(luma_ptr + kCflLumaBufferStride, result);
- return result;
-}
-
-// Takes two halves of a vertically added pair of rows and completes the
-// computation for one output row.
-inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0,
- const __m128i vertical_sum1,
- int16_t* luma_ptr) {
- __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
- result = _mm_slli_epi16(result, 1);
- StoreUnaligned16(luma_ptr, result);
- return result;
-}
-
template <int block_height_log2>
void CflSubsampler420_4xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
@@ -511,17 +507,6 @@ void CflSubsampler420_4xH_SSE4_1(
}
}
-// This duplicates the last two 16-bit values in |row|.
-inline __m128i LastRowSamples(const __m128i row) {
- return _mm_shuffle_epi32(row, 0xFF);
-}
-
-// This duplicates the last 16-bit value in |row|.
-inline __m128i LastRowResult(const __m128i row) {
- const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF);
- return _mm_shuffle_epi32(dup_row, 0xFF);
-}
-
template <int block_height_log2, int max_luma_width>
inline void CflSubsampler420Impl_8xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
@@ -655,10 +640,11 @@ inline void CflSubsampler420Impl_WxH_SSE4_1(
__m128i final_sum = zero;
const int block_height = 1 << block_height_log2;
const int luma_height = std::min(block_height, max_luma_height >> 1);
+ static_assert(max_luma_width <= 32, "");
int16_t* luma_ptr = luma[0];
__m128i final_row_result;
- // Begin first y section, covering width up to 16.
+ // Begin first y section, covering width up to 32.
int y = 0;
do {
const uint8_t* src_next = src + stride;
@@ -694,29 +680,32 @@ inline void CflSubsampler420Impl_WxH_SSE4_1(
final_row_result =
StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
sum = _mm_add_epi16(sum, final_row_result);
+ if (block_width_log2 == 5) {
+ const __m128i wide_fill = LastRowResult(final_row_result);
+ sum = _mm_add_epi16(sum, wide_fill);
+ sum = _mm_add_epi16(sum, wide_fill);
+ }
final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
src += stride << 1;
luma_ptr += kCflLumaBufferStride;
} while (++y < luma_height);
- // Because max_luma_width is at most 32, any values beyond x=16 will
- // necessarily be duplicated.
- if (block_width_log2 == 5) {
- const __m128i wide_fill = LastRowResult(final_row_result);
- // Multiply duplicated value by number of occurrences, height * 4, since
- // there are 16 in each row and the value appears in the vector 4 times.
- final_sum = _mm_add_epi32(
- final_sum,
- _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), block_height_log2 + 2));
- }
-
// Begin second y section.
if (y < block_height) {
const __m128i final_fill0 =
LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
const __m128i final_fill1 =
LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
+ __m128i wide_fill;
+
+ if (block_width_log2 == 5) {
+ // There are 16 16-bit fill values per row, shifting by 2 accounts for
+ // the widening to 32-bit.
+ wide_fill =
+ _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2);
+ }
+
const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
@@ -726,6 +715,9 @@ inline void CflSubsampler420Impl_WxH_SSE4_1(
do {
StoreUnaligned16(luma_ptr, final_fill0);
StoreUnaligned16(luma_ptr + 8, final_fill1);
+ if (block_width_log2 == 5) {
+ final_sum = _mm_add_epi32(final_sum, wide_fill);
+ }
luma_ptr += kCflLumaBufferStride;
final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
@@ -747,14 +739,10 @@ inline void CflSubsampler420Impl_WxH_SSE4_1(
const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
final_row_result = _mm_sub_epi16(samples1, averages);
StoreUnaligned16(luma_ptr + 8, final_row_result);
- }
- if (block_width_log2 == 5) {
- int16_t* wide_luma_ptr = luma[0] + 16;
- const __m128i wide_fill = LastRowResult(final_row_result);
- for (int i = 0; i < block_height;
- ++i, wide_luma_ptr += kCflLumaBufferStride) {
- StoreUnaligned16(wide_luma_ptr, wide_fill);
- StoreUnaligned16(wide_luma_ptr + 8, wide_fill);
+ if (block_width_log2 == 5) {
+ const __m128i wide_fill = LastRowResult(final_row_result);
+ StoreUnaligned16(luma_ptr + 16, wide_fill);
+ StoreUnaligned16(luma_ptr + 24, wide_fill);
}
}
}
@@ -958,7 +946,882 @@ void Init8bpp() {
} // namespace
} // namespace low_bitdepth
-void IntraPredCflInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_10bpp_SSE4_1
+
+inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
+ __m128i alpha_sign, __m128i dc_q0) {
+ const __m128i ac_q3 = LoadUnaligned16(input);
+ const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+ __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
+ scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
+ return _mm_add_epi16(scaled_luma_q0, dc_q0);
+}
+
+inline __m128i ClipEpi16(__m128i x, __m128i min, __m128i max) {
+ return _mm_max_epi16(_mm_min_epi16(x, max), min);
+}
+
+template <int width, int height>
+void CflIntraPredictor_10bpp_SSE4_1(
+ void* const dest, ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ constexpr int kCflLumaBufferStrideLog2_16i = 5;
+ constexpr int kCflLumaBufferStrideLog2_128i =
+ kCflLumaBufferStrideLog2_16i - 3;
+ constexpr int kRowIncr = 1 << kCflLumaBufferStrideLog2_128i;
+ auto* dst = static_cast<uint16_t*>(dest);
+ const __m128i alpha_sign = _mm_set1_epi16(alpha);
+ const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+ auto* row = reinterpret_cast<const __m128i*>(luma);
+ const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i);
+ const __m128i dc_val = _mm_set1_epi16(dst[0]);
+ const __m128i min = _mm_setzero_si128();
+ const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1);
+
+ stride >>= 1;
+
+ do {
+ __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val);
+ res = ClipEpi16(res, min, max);
+ if (width == 4) {
+ StoreLo8(dst, res);
+ } else if (width == 8) {
+ StoreUnaligned16(dst, res);
+ } else if (width == 16) {
+ StoreUnaligned16(dst, res);
+ const __m128i res_1 =
+ CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
+ StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max));
+ } else {
+ StoreUnaligned16(dst, res);
+ const __m128i res_1 =
+ CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
+ StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max));
+ const __m128i res_2 =
+ CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val);
+ StoreUnaligned16(dst + 16, ClipEpi16(res_2, min, max));
+ const __m128i res_3 =
+ CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val);
+ StoreUnaligned16(dst + 24, ClipEpi16(res_3, min, max));
+ }
+
+ dst += stride;
+ } while ((row += kRowIncr) < row_end);
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 4, "");
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ __m128i zero = _mm_setzero_si128();
+ __m128i sum = zero;
+ __m128i samples;
+ int y = visible_height;
+
+ do {
+ samples = LoadHi8(LoadLo8(src), src + src_stride);
+ src += src_stride << 1;
+ sum = _mm_add_epi16(sum, samples);
+ y -= 2;
+ } while (y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ samples = _mm_unpackhi_epi64(samples, samples);
+ do {
+ sum = _mm_add_epi16(sum, samples);
+ y += 2;
+ } while (y < block_height);
+ }
+
+ sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+ // Here the left shift by 3 (to increase precision) is nullified in right
+ // shift ((log2 of width 4) + 1).
+ __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2 - 1);
+ averages = _mm_shufflelo_epi16(averages, 0);
+ src = static_cast<const uint16_t*>(source);
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ samples = LoadLo8(src);
+ samples = _mm_slli_epi16(samples, 3);
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ src += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_cast<void>(max_luma_width);
+ static_cast<void>(max_luma_height);
+ static_assert(block_height_log2 <= 4, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+
+ if (block_height <= max_luma_height) {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const __m128i dup16 = _mm_set1_epi32(0x01000100);
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum = zero;
+ __m128i samples;
+ int y = visible_height;
+
+ do {
+ samples = LoadUnaligned16(src);
+ src += src_stride;
+ sum = _mm_add_epi16(sum, samples);
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ do {
+ sum = _mm_add_epi16(sum, samples);
+ } while (++y < block_height);
+ }
+
+ sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+ // Here the left shift by 3 (to increase precision) is nullified in right
+ // shift (log2 of width 8).
+ __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2);
+ averages = _mm_shuffle_epi8(averages, dup16);
+
+ src = static_cast<const uint16_t*>(source);
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ samples = LoadUnaligned16(src);
+ samples = _mm_slli_epi16(samples, 3);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ src += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_cast<void>(max_luma_width);
+ static_cast<void>(max_luma_height);
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 8;
+
+ const int horz_inside = block_width <= max_luma_width;
+ const int vert_inside = block_height <= max_luma_height;
+ if (horz_inside && vert_inside) {
+ CflSubsampler444_8xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_8xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, bool is_inside>
+void CflSubsampler444_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const int block_width = 1 << block_width_log2;
+ const __m128i dup16 = _mm_set1_epi32(0x01000100);
+ const __m128i zero = _mm_setzero_si128();
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ __m128i sum = zero;
+ __m128i inner_sum_lo, inner_sum_hi;
+ __m128i samples[4];
+ int y = visible_height;
+
+ do {
+ samples[0] = LoadUnaligned16(src);
+ samples[1] = (max_luma_width >= 16) ? LoadUnaligned16(src + 8)
+ : LastRowResult(samples[0]);
+ __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]);
+ if (block_width == 32) {
+ samples[2] = (max_luma_width >= 24) ? LoadUnaligned16(src + 16)
+ : LastRowResult(samples[1]);
+ samples[3] = (max_luma_width == 32) ? LoadUnaligned16(src + 24)
+ : LastRowResult(samples[2]);
+
+ inner_sum = _mm_add_epi16(samples[2], inner_sum);
+ inner_sum = _mm_add_epi16(samples[3], inner_sum);
+ }
+ inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
+ inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
+ sum = _mm_add_epi32(sum, inner_sum_lo);
+ sum = _mm_add_epi32(sum, inner_sum_hi);
+ src += src_stride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]);
+ if (block_width == 32) {
+ inner_sum = _mm_add_epi16(samples[2], inner_sum);
+ inner_sum = _mm_add_epi16(samples[3], inner_sum);
+ }
+ inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
+ inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
+ do {
+ sum = _mm_add_epi32(sum, inner_sum_lo);
+ sum = _mm_add_epi32(sum, inner_sum_hi);
+ } while (++y < block_height);
+ }
+
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+ // Here the left shift by 3 (to increase precision) is subtracted in right
+ // shift factor (block_width_log2 + block_height_log2 - 3).
+ __m128i averages =
+ RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2 - 3);
+ averages = _mm_shuffle_epi8(averages, dup16);
+
+ src = static_cast<const uint16_t*>(source);
+ __m128i samples_ext = zero;
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ int idx = 0;
+ for (int x = 0; x < block_width; x += 8) {
+ if (max_luma_width > x) {
+ samples[idx] = LoadUnaligned16(&src[x]);
+ samples[idx] = _mm_slli_epi16(samples[idx], 3);
+ samples_ext = samples[idx];
+ } else {
+ samples[idx] = LastRowResult(samples_ext);
+ }
+ StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages));
+ }
+ src += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ int idx = 0;
+ for (int x = 0; x < block_width; x += 8) {
+ StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages));
+ }
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler444_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_width_log2 == 4 || block_width_log2 == 5,
+ "This function will only work for block_width 16 and 32.");
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+
+ const int block_height = 1 << block_height_log2;
+ const int vert_inside = block_height <= max_luma_height;
+ if (vert_inside) {
+ CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = luma_height;
+
+ do {
+ const __m128i samples_row0 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i samples_row1 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1);
+
+ const __m128i samples_row2 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i samples_row3 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3);
+ __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ const __m128i samples_row4 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i samples_row5 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5);
+
+ const __m128i samples_row6 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i samples_row7 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ y -= 4;
+ } while (y != 0);
+
+ const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill);
+ for (y = luma_height; y < block_height; ++y) {
+ StoreLo8(luma_ptr, final_fill);
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ }
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_U32(
+ final_sum, block_height_log2 + 2 /*log2 of width 4*/);
+
+ averages = _mm_shufflelo_epi16(averages, 0);
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const __m128i samples = LoadLo8(luma_ptr);
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ int16_t* luma_ptr = luma[0];
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = luma_height;
+
+ do {
+ const __m128i samples_row00 = LoadUnaligned16(src);
+ const __m128i samples_row01 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row00);
+ src += src_stride;
+ const __m128i samples_row10 = LoadUnaligned16(src);
+ const __m128i samples_row11 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row10);
+ src += src_stride;
+ const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10);
+ const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11);
+ __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row20 = LoadUnaligned16(src);
+ const __m128i samples_row21 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row20);
+ src += src_stride;
+ const __m128i samples_row30 = LoadUnaligned16(src);
+ const __m128i samples_row31 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row30);
+ src += src_stride;
+ const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30);
+ const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row40 = LoadUnaligned16(src);
+ const __m128i samples_row41 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row40);
+ src += src_stride;
+ const __m128i samples_row50 = LoadUnaligned16(src);
+ const __m128i samples_row51 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row50);
+ src += src_stride;
+ const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50);
+ const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row60 = LoadUnaligned16(src);
+ const __m128i samples_row61 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row60);
+ src += src_stride;
+ const __m128i samples_row70 = LoadUnaligned16(src);
+ const __m128i samples_row71 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row70);
+ src += src_stride;
+ const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70);
+ const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ y -= 4;
+ } while (y != 0);
+
+ // Duplicate the final row downward to the end after max_luma_height.
+ const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill);
+ const __m128i final_fill_to_sum1 =
+ _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8));
+ const __m128i final_fill_to_sum =
+ _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1);
+ for (y = luma_height; y < block_height; ++y) {
+ StoreUnaligned16(luma_ptr, final_fill);
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ }
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_S32(
+ final_sum, block_height_log2 + 3 /*log2 of width 8*/);
+
+ averages = _mm_shufflelo_epi16(averages, 0);
+ averages = _mm_shuffle_epi32(averages, 0);
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const __m128i samples = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_height_log2>
+void CflSubsampler420_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ if (max_luma_width == 8) {
+ CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>(
+ luma, max_luma_height, source, stride);
+ }
+}
+
+// Builds the 4:2:0-subsampled, 2x-scaled luma buffer used by CfL prediction
+// for 16- and 32-wide chroma blocks, then subtracts the rounded block
+// average from every stored value. Source rows beyond |max_luma_height| and
+// columns beyond |max_luma_width| are filled by replicating the last valid
+// row/column.
+template <int block_width_log2, int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_WxH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_height, const void* const source, ptrdiff_t stride) {
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  const __m128i zero = _mm_setzero_si128();
+  // Running 32-bit sum of all stored luma values; reduced to the block
+  // average at the end.
+  __m128i final_sum = zero;
+  const int block_height = 1 << block_height_log2;
+  // Each output row consumes two source rows, so only |max_luma_height| / 2
+  // rows can be produced from real samples; the rest are row fills.
+  const int luma_height = std::min(block_height, max_luma_height >> 1);
+  int16_t* luma_ptr = luma[0];
+  __m128i final_row_result;
+  // Begin first y section, covering width up to 32.
+  int y = luma_height;
+
+  do {
+    const uint16_t* src_next = src + src_stride;
+    // Loads past |max_luma_width| are replaced by duplicating the last two
+    // valid samples; the comparisons are on a template constant.
+    const __m128i samples_row00 = LoadUnaligned16(src);
+    const __m128i samples_row01 = (max_luma_width >= 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row00);
+    const __m128i samples_row02 = (max_luma_width >= 24)
+                                      ? LoadUnaligned16(src + 16)
+                                      : LastRowSamples(samples_row01);
+    const __m128i samples_row03 = (max_luma_width == 32)
+                                      ? LoadUnaligned16(src + 24)
+                                      : LastRowSamples(samples_row02);
+    const __m128i samples_row10 = LoadUnaligned16(src_next);
+    const __m128i samples_row11 = (max_luma_width >= 16)
+                                      ? LoadUnaligned16(src_next + 8)
+                                      : LastRowSamples(samples_row10);
+    const __m128i samples_row12 = (max_luma_width >= 24)
+                                      ? LoadUnaligned16(src_next + 16)
+                                      : LastRowSamples(samples_row11);
+    const __m128i samples_row13 = (max_luma_width == 32)
+                                      ? LoadUnaligned16(src_next + 24)
+                                      : LastRowSamples(samples_row12);
+    // Vertical pair sums; StoreLumaResults8_420 finishes the horizontal add
+    // and the 2x scale, and writes one output row of 8 values.
+    const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10);
+    const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11);
+    const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12);
+    const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13);
+    __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
+    final_row_result =
+        StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
+    sum = _mm_add_epi16(sum, final_row_result);
+    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+
+    // Because max_luma_width is at most 32, any values beyond x=16 will
+    // necessarily be duplicated.
+    if (block_width_log2 == 5) {
+      const __m128i wide_fill = LastRowResult(final_row_result);
+      // There are 16 16-bit fill values per row, shifting by 2 accounts for
+      // the widening to 32-bit.
+      final_sum = _mm_add_epi32(
+          final_sum, _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), 2));
+    }
+    src += src_stride << 1;
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+
+  // Begin second y section: replicate the last computed row down to
+  // |block_height| while keeping |final_sum| in step.
+  y = luma_height;
+  if (y < block_height) {
+    const __m128i final_fill0 =
+        LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+    const __m128i final_fill1 =
+        LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
+    // Zero-initialized to avoid a conditionally-uninitialized read path
+    // (and -Wmaybe-uninitialized); it is only read when assigned below.
+    __m128i wide_fill = zero;
+    if (block_width_log2 == 5) {
+      // There are 16 16-bit fill values per row, shifting by 2 accounts for
+      // the widening to 32-bit.
+      wide_fill =
+          _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2);
+    }
+    const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
+    const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
+    const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
+    const __m128i final_fill_to_sum =
+        _mm_add_epi32(final_inner_sum0, final_inner_sum1);
+
+    do {
+      StoreUnaligned16(luma_ptr, final_fill0);
+      StoreUnaligned16(luma_ptr + 8, final_fill1);
+      if (block_width_log2 == 5) {
+        final_sum = _mm_add_epi32(final_sum, wide_fill);
+      }
+      luma_ptr += kCflLumaBufferStride;
+      final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+    } while (++y < block_height);
+  } // End second y section.
+
+  // Horizontal reduction of the four 32-bit lanes into lane 0.
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+  // Rounded mean over width * height; broadcast to all 16-bit lanes.
+  __m128i averages = RightShiftWithRounding_S32(
+      final_sum, block_width_log2 + block_height_log2);
+  averages = _mm_shufflelo_epi16(averages, 0);
+  averages = _mm_shuffle_epi32(averages, 0);
+
+  // Second pass: subtract the average from every buffered value. For
+  // 32-wide blocks, columns 16..31 are the replicated last column.
+  luma_ptr = luma[0];
+  y = block_height;
+  do {
+    const __m128i samples0 = LoadUnaligned16(luma_ptr);
+    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages));
+    const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
+    final_row_result = _mm_sub_epi16(samples1, averages);
+    StoreUnaligned16(luma_ptr + 8, final_row_result);
+
+    if (block_width_log2 == 5) {
+      const __m128i wide_fill = LastRowResult(final_row_result);
+      StoreUnaligned16(luma_ptr + 16, wide_fill);
+      StoreUnaligned16(luma_ptr + 24, wide_fill);
+    }
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+}
+
+// Runtime-to-compile-time bridge: picks the WxH 4:2:0 subsampler
+// specialization matching |max_luma_width| (8, 16, 24, or 32).
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler420_WxH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* const source, ptrdiff_t stride) {
+  if (max_luma_width == 8) {
+    CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
+        luma, max_luma_height, source, stride);
+  } else if (max_luma_width == 16) {
+    CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>(
+        luma, max_luma_height, source, stride);
+  } else if (max_luma_width == 24) {
+    CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>(
+        luma, max_luma_height, source, stride);
+  } else {
+    assert(max_luma_width == 32);
+    CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>(
+        luma, max_luma_height, source, stride);
+  }
+}
+
+// Registers all 10-bpp SSE4.1 CfL intra predictors and subsamplers in the
+// writable dispatch table. Each entry is individually gated by a
+// DSP_ENABLED_10BPP_SSE4_1 check.
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+
+  // CfL intra predictors, templated on <width, height>.
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize4x4] =
+      CflIntraPredictor_10bpp_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize4x8] =
+      CflIntraPredictor_10bpp_SSE4_1<4, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize4x16] =
+      CflIntraPredictor_10bpp_SSE4_1<4, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize8x4] =
+      CflIntraPredictor_10bpp_SSE4_1<8, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize8x8] =
+      CflIntraPredictor_10bpp_SSE4_1<8, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize8x16] =
+      CflIntraPredictor_10bpp_SSE4_1<8, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize8x32] =
+      CflIntraPredictor_10bpp_SSE4_1<8, 32>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize16x4] =
+      CflIntraPredictor_10bpp_SSE4_1<16, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize16x8] =
+      CflIntraPredictor_10bpp_SSE4_1<16, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize16x16] =
+      CflIntraPredictor_10bpp_SSE4_1<16, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize16x32] =
+      CflIntraPredictor_10bpp_SSE4_1<16, 32>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize32x8] =
+      CflIntraPredictor_10bpp_SSE4_1<32, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize32x16] =
+      CflIntraPredictor_10bpp_SSE4_1<32, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize32x32] =
+      CflIntraPredictor_10bpp_SSE4_1<32, 32>;
+#endif
+  // 4:2:0 subsamplers; template arguments are log2 block dimensions.
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+      CflSubsampler420_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+      CflSubsampler420_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+      CflSubsampler420_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+      CflSubsampler420_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+      CflSubsampler420_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+      CflSubsampler420_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+      CflSubsampler420_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<5, 5>;
+#endif
+
+  // 4:4:4 subsamplers; template arguments are log2 block dimensions.
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+      CflSubsampler444_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+      CflSubsampler444_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+      CflSubsampler444_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+      CflSubsampler444_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+      CflSubsampler444_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+      CflSubsampler444_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+      CflSubsampler444_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<5, 5>;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Public entry point: installs the SSE4.1 CfL functions for 8 bpp and, when
+// the build supports it, 10 bpp.
+void IntraPredCflInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1