diff options
Diffstat (limited to 'src/dsp/x86/intrapred_cfl_sse4.cc')
-rw-r--r-- | src/dsp/x86/intrapred_cfl_sse4.cc | 1057 |
1 files changed, 960 insertions, 97 deletions
diff --git a/src/dsp/x86/intrapred_cfl_sse4.cc b/src/dsp/x86/intrapred_cfl_sse4.cc index fac1556..f2dcfdb 100644 --- a/src/dsp/x86/intrapred_cfl_sse4.cc +++ b/src/dsp/x86/intrapred_cfl_sse4.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "src/dsp/intrapred.h" +#include "src/dsp/intrapred_cfl.h" #include "src/utils/cpu.h" #if LIBGAV1_TARGETING_SSE4_1 @@ -29,9 +29,48 @@ #include "src/dsp/x86/common_sse4.h" #include "src/utils/common.h" #include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" namespace libgav1 { namespace dsp { +namespace { + +// This duplicates the last two 16-bit values in |row|. +inline __m128i LastRowSamples(const __m128i row) { + return _mm_shuffle_epi32(row, 0xFF); +} + +// This duplicates the last 16-bit value in |row|. +inline __m128i LastRowResult(const __m128i row) { + const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF); + return _mm_shuffle_epi32(dup_row, 0xFF); +} + +// Takes in two sums of input row pairs, and completes the computation for two +// output rows. +inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0, + const __m128i vertical_sum1, + int16_t* luma_ptr) { + __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1); + result = _mm_slli_epi16(result, 1); + StoreLo8(luma_ptr, result); + StoreHi8(luma_ptr + kCflLumaBufferStride, result); + return result; +} + +// Takes two halves of a vertically added pair of rows and completes the +// computation for one output row. 
+inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0, + const __m128i vertical_sum1, + int16_t* luma_ptr) { + __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1); + result = _mm_slli_epi16(result, 1); + StoreUnaligned16(luma_ptr, result); + return result; +} + +} // namespace + namespace low_bitdepth { namespace { @@ -40,8 +79,8 @@ namespace { inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12, __m128i alpha_sign, __m128i dc_q0) { - __m128i ac_q3 = LoadUnaligned16(input); - __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); + const __m128i ac_q3 = LoadUnaligned16(input); + const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12); scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign); return _mm_add_epi16(scaled_luma_q0, dc_q0); @@ -88,8 +127,7 @@ void CflIntraPredictor_SSE4_1( template <int block_height_log2, bool is_inside> void CflSubsampler444_4xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], - const int /*max_luma_width*/, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const int max_luma_height, const void* const source, ptrdiff_t stride) { static_assert(block_height_log2 <= 4, ""); const int block_height = 1 << block_height_log2; const int visible_height = max_luma_height; @@ -119,12 +157,15 @@ void CflSubsampler444_4xH_SSE4_1( } while (y < visible_height); if (!is_inside) { - int y = visible_height; + // Replicate the 2 high lanes. 
+ samples = _mm_shuffle_epi32(samples, 0xee); do { + StoreLo8(luma_ptr, samples); + luma_ptr += kCflLumaBufferStride; StoreHi8(luma_ptr, samples); luma_ptr += kCflLumaBufferStride; sum = _mm_add_epi16(sum, samples); - ++y; + y += 2; } while (y < block_height); } @@ -152,15 +193,15 @@ void CflSubsampler444_4xH_SSE4_1( static_assert(block_height_log2 <= 4, ""); assert(max_luma_width >= 4); assert(max_luma_height >= 4); - const int block_height = 1 << block_height_log2; - const int block_width = 4; + static_cast<void>(max_luma_width); + constexpr int block_height = 1 << block_height_log2; - if (block_height <= max_luma_height && block_width <= max_luma_width) { - CflSubsampler444_4xH_SSE4_1<block_height_log2, true>( - luma, max_luma_width, max_luma_height, source, stride); + if (block_height <= max_luma_height) { + CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height, + source, stride); } else { - CflSubsampler444_4xH_SSE4_1<block_height_log2, false>( - luma, max_luma_width, max_luma_height, source, stride); + CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height, + source, stride); } } @@ -302,19 +343,9 @@ void CflSubsampler444_SSE4_1( __m128i inner_sum_lo, inner_sum_hi; int y = 0; do { -#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are - // then masked off by blendv, MSAN isn't smart enough to - // understand that. So we switch to a C implementation here. - uint16_t c_arr[16]; - for (int x = 0; x < 16; x++) { - const int x_index = std::min(x, visible_width_16 - 1); - c_arr[x] = src[x_index] << 3; - } - samples0 = LoadUnaligned16(c_arr); - samples1 = LoadUnaligned16(c_arr + 8); - static_cast<void>(blend_mask_16); -#else - __m128i samples01 = LoadUnaligned16(src); + // We can load uninitialized values here. Even though they are then masked + // off by blendv, MSAN doesn't model that behavior. 
+ __m128i samples01 = LoadUnaligned16Msan(src, invisible_width_16); if (!inside) { const __m128i border16 = @@ -323,26 +354,15 @@ void CflSubsampler444_SSE4_1( } samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples01), 3); samples1 = _mm_slli_epi16(_mm_unpackhi_epi8(samples01, zero), 3); -#endif // LIBGAV1_MSAN StoreUnaligned16(luma_ptr, samples0); StoreUnaligned16(luma_ptr + 8, samples1); __m128i inner_sum = _mm_add_epi16(samples0, samples1); if (block_width == 32) { -#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are - // then masked off by blendv, MSAN isn't smart enough to - // understand that. So we switch to a C implementation here. - uint16_t c_arr[16]; - for (int x = 16; x < 32; x++) { - const int x_index = std::min(x, visible_width_32 - 1); - c_arr[x - 16] = src[x_index] << 3; - } - samples2 = LoadUnaligned16(c_arr); - samples3 = LoadUnaligned16(c_arr + 8); - static_cast<void>(blend_mask_32); -#else - __m128i samples23 = LoadUnaligned16(src + 16); + // We can load uninitialized values here. Even though they are then masked + // off by blendv, MSAN doesn't model that behavior. + __m128i samples23 = LoadUnaligned16Msan(src + 16, invisible_width_32); if (!inside) { const __m128i border32 = _mm_set1_epi8(static_cast<int8_t>(src[visible_width_32 - 1])); @@ -350,7 +370,6 @@ void CflSubsampler444_SSE4_1( } samples2 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples23), 3); samples3 = _mm_slli_epi16(_mm_unpackhi_epi8(samples23, zero), 3); -#endif // LIBGAV1_MSAN StoreUnaligned16(luma_ptr + 16, samples2); StoreUnaligned16(luma_ptr + 24, samples3); @@ -418,29 +437,6 @@ void CflSubsampler444_SSE4_1( } } -// Takes in two sums of input row pairs, and completes the computation for two -// output rows. 
-inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0, - const __m128i vertical_sum1, - int16_t* luma_ptr) { - __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1); - result = _mm_slli_epi16(result, 1); - StoreLo8(luma_ptr, result); - StoreHi8(luma_ptr + kCflLumaBufferStride, result); - return result; -} - -// Takes two halves of a vertically added pair of rows and completes the -// computation for one output row. -inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0, - const __m128i vertical_sum1, - int16_t* luma_ptr) { - __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1); - result = _mm_slli_epi16(result, 1); - StoreUnaligned16(luma_ptr, result); - return result; -} - template <int block_height_log2> void CflSubsampler420_4xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], @@ -511,17 +507,6 @@ void CflSubsampler420_4xH_SSE4_1( } } -// This duplicates the last two 16-bit values in |row|. -inline __m128i LastRowSamples(const __m128i row) { - return _mm_shuffle_epi32(row, 0xFF); -} - -// This duplicates the last 16-bit value in |row|. -inline __m128i LastRowResult(const __m128i row) { - const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF); - return _mm_shuffle_epi32(dup_row, 0xFF); -} - template <int block_height_log2, int max_luma_width> inline void CflSubsampler420Impl_8xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], @@ -655,10 +640,11 @@ inline void CflSubsampler420Impl_WxH_SSE4_1( __m128i final_sum = zero; const int block_height = 1 << block_height_log2; const int luma_height = std::min(block_height, max_luma_height >> 1); + static_assert(max_luma_width <= 32, ""); int16_t* luma_ptr = luma[0]; __m128i final_row_result; - // Begin first y section, covering width up to 16. + // Begin first y section, covering width up to 32. 
int y = 0; do { const uint8_t* src_next = src + stride; @@ -694,29 +680,32 @@ inline void CflSubsampler420Impl_WxH_SSE4_1( final_row_result = StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8); sum = _mm_add_epi16(sum, final_row_result); + if (block_width_log2 == 5) { + const __m128i wide_fill = LastRowResult(final_row_result); + sum = _mm_add_epi16(sum, wide_fill); + sum = _mm_add_epi16(sum, wide_fill); + } final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); src += stride << 1; luma_ptr += kCflLumaBufferStride; } while (++y < luma_height); - // Because max_luma_width is at most 32, any values beyond x=16 will - // necessarily be duplicated. - if (block_width_log2 == 5) { - const __m128i wide_fill = LastRowResult(final_row_result); - // Multiply duplicated value by number of occurrences, height * 4, since - // there are 16 in each row and the value appears in the vector 4 times. - final_sum = _mm_add_epi32( - final_sum, - _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), block_height_log2 + 2)); - } - // Begin second y section. if (y < block_height) { const __m128i final_fill0 = LoadUnaligned16(luma_ptr - kCflLumaBufferStride); const __m128i final_fill1 = LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8); + __m128i wide_fill; + + if (block_width_log2 == 5) { + // There are 16 16-bit fill values per row, shifting by 2 accounts for + // the widening to 32-bit. 
+ wide_fill = + _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2); + } + const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1); const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum); const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero); @@ -726,6 +715,9 @@ inline void CflSubsampler420Impl_WxH_SSE4_1( do { StoreUnaligned16(luma_ptr, final_fill0); StoreUnaligned16(luma_ptr + 8, final_fill1); + if (block_width_log2 == 5) { + final_sum = _mm_add_epi32(final_sum, wide_fill); + } luma_ptr += kCflLumaBufferStride; final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); @@ -747,14 +739,10 @@ inline void CflSubsampler420Impl_WxH_SSE4_1( const __m128i samples1 = LoadUnaligned16(luma_ptr + 8); final_row_result = _mm_sub_epi16(samples1, averages); StoreUnaligned16(luma_ptr + 8, final_row_result); - } - if (block_width_log2 == 5) { - int16_t* wide_luma_ptr = luma[0] + 16; - const __m128i wide_fill = LastRowResult(final_row_result); - for (int i = 0; i < block_height; - ++i, wide_luma_ptr += kCflLumaBufferStride) { - StoreUnaligned16(wide_luma_ptr, wide_fill); - StoreUnaligned16(wide_luma_ptr + 8, wide_fill); + if (block_width_log2 == 5) { + const __m128i wide_fill = LastRowResult(final_row_result); + StoreUnaligned16(luma_ptr + 16, wide_fill); + StoreUnaligned16(luma_ptr + 24, wide_fill); } } } @@ -958,7 +946,882 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void IntraPredCflInit_SSE4_1() { low_bitdepth::Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +//------------------------------------------------------------------------------ +// CflIntraPredictor_10bpp_SSE4_1 + +inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12, + __m128i alpha_sign, __m128i dc_q0) { + const __m128i ac_q3 = LoadUnaligned16(input); + const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); + __m128i scaled_luma_q0 = 
_mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12); + scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign); + return _mm_add_epi16(scaled_luma_q0, dc_q0); +} + +inline __m128i ClipEpi16(__m128i x, __m128i min, __m128i max) { + return _mm_max_epi16(_mm_min_epi16(x, max), min); +} + +template <int width, int height> +void CflIntraPredictor_10bpp_SSE4_1( + void* const dest, ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + constexpr int kCflLumaBufferStrideLog2_16i = 5; + constexpr int kCflLumaBufferStrideLog2_128i = + kCflLumaBufferStrideLog2_16i - 3; + constexpr int kRowIncr = 1 << kCflLumaBufferStrideLog2_128i; + auto* dst = static_cast<uint16_t*>(dest); + const __m128i alpha_sign = _mm_set1_epi16(alpha); + const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); + auto* row = reinterpret_cast<const __m128i*>(luma); + const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i); + const __m128i dc_val = _mm_set1_epi16(dst[0]); + const __m128i min = _mm_setzero_si128(); + const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1); + + stride >>= 1; + + do { + __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val); + res = ClipEpi16(res, min, max); + if (width == 4) { + StoreLo8(dst, res); + } else if (width == 8) { + StoreUnaligned16(dst, res); + } else if (width == 16) { + StoreUnaligned16(dst, res); + const __m128i res_1 = + CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val); + StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max)); + } else { + StoreUnaligned16(dst, res); + const __m128i res_1 = + CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val); + StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max)); + const __m128i res_2 = + CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val); + StoreUnaligned16(dst + 16, ClipEpi16(res_2, min, max)); + const __m128i res_3 = + CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val); + 
StoreUnaligned16(dst + 24, ClipEpi16(res_3, min, max)); + } + + dst += stride; + } while ((row += kRowIncr) < row_end); +} + +template <int block_height_log2, bool is_inside> +void CflSubsampler444_4xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + static_assert(block_height_log2 <= 4, ""); + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + __m128i zero = _mm_setzero_si128(); + __m128i sum = zero; + __m128i samples; + int y = visible_height; + + do { + samples = LoadHi8(LoadLo8(src), src + src_stride); + src += src_stride << 1; + sum = _mm_add_epi16(sum, samples); + y -= 2; + } while (y != 0); + + if (!is_inside) { + y = visible_height; + samples = _mm_unpackhi_epi64(samples, samples); + do { + sum = _mm_add_epi16(sum, samples); + y += 2; + } while (y < block_height); + } + + sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4)); + + // Here the left shift by 3 (to increase precision) is nullified in right + // shift ((log2 of width 4) + 1). 
+ __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2 - 1); + averages = _mm_shufflelo_epi16(averages, 0); + src = static_cast<const uint16_t*>(source); + luma_ptr = luma[0]; + y = visible_height; + do { + samples = LoadLo8(src); + samples = _mm_slli_epi16(samples, 3); + StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages)); + src += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_height_log2> +void CflSubsampler444_4xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_cast<void>(max_luma_width); + static_cast<void>(max_luma_height); + static_assert(block_height_log2 <= 4, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + + if (block_height <= max_luma_height) { + CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height, + source, stride); + } +} + +template <int block_height_log2, bool is_inside> +void CflSubsampler444_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + const __m128i zero = _mm_setzero_si128(); + __m128i sum = zero; + __m128i samples; + int y = visible_height; + + do { + samples 
= LoadUnaligned16(src); + src += src_stride; + sum = _mm_add_epi16(sum, samples); + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + do { + sum = _mm_add_epi16(sum, samples); + } while (++y < block_height); + } + + sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4)); + + // Here the left shift by 3 (to increase precision) is nullified in right + // shift (log2 of width 8). + __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2); + averages = _mm_shuffle_epi8(averages, dup16); + + src = static_cast<const uint16_t*>(source); + luma_ptr = luma[0]; + y = visible_height; + do { + samples = LoadUnaligned16(src); + samples = _mm_slli_epi16(samples, 3); + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages)); + src += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_height_log2> +void CflSubsampler444_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_cast<void>(max_luma_width); + static_cast<void>(max_luma_height); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + const int block_width = 8; + + const int horz_inside = block_width <= max_luma_width; + const int vert_inside = block_height <= max_luma_height; + if (horz_inside && vert_inside) { + CflSubsampler444_8xH_SSE4_1<block_height_log2, true>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler444_8xH_SSE4_1<block_height_log2, false>(luma, 
max_luma_height, + source, stride); + } +} + +template <int block_width_log2, int block_height_log2, bool is_inside> +void CflSubsampler444_WxH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const int block_width = 1 << block_width_log2; + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const __m128i zero = _mm_setzero_si128(); + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + __m128i sum = zero; + __m128i inner_sum_lo, inner_sum_hi; + __m128i samples[4]; + int y = visible_height; + + do { + samples[0] = LoadUnaligned16(src); + samples[1] = (max_luma_width >= 16) ? LoadUnaligned16(src + 8) + : LastRowResult(samples[0]); + __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]); + if (block_width == 32) { + samples[2] = (max_luma_width >= 24) ? LoadUnaligned16(src + 16) + : LastRowResult(samples[1]); + samples[3] = (max_luma_width == 32) ? 
LoadUnaligned16(src + 24) + : LastRowResult(samples[2]); + + inner_sum = _mm_add_epi16(samples[2], inner_sum); + inner_sum = _mm_add_epi16(samples[3], inner_sum); + } + inner_sum_lo = _mm_cvtepu16_epi32(inner_sum); + inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero); + sum = _mm_add_epi32(sum, inner_sum_lo); + sum = _mm_add_epi32(sum, inner_sum_hi); + src += src_stride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]); + if (block_width == 32) { + inner_sum = _mm_add_epi16(samples[2], inner_sum); + inner_sum = _mm_add_epi16(samples[3], inner_sum); + } + inner_sum_lo = _mm_cvtepu16_epi32(inner_sum); + inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero); + do { + sum = _mm_add_epi32(sum, inner_sum_lo); + sum = _mm_add_epi32(sum, inner_sum_hi); + } while (++y < block_height); + } + + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4)); + + // Here the left shift by 3 (to increase precision) is subtracted in right + // shift factor (block_width_log2 + block_height_log2 - 3). 
+ __m128i averages = + RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2 - 3); + averages = _mm_shuffle_epi8(averages, dup16); + + src = static_cast<const uint16_t*>(source); + __m128i samples_ext = zero; + luma_ptr = luma[0]; + y = visible_height; + do { + int idx = 0; + for (int x = 0; x < block_width; x += 8) { + if (max_luma_width > x) { + samples[idx] = LoadUnaligned16(&src[x]); + samples[idx] = _mm_slli_epi16(samples[idx], 3); + samples_ext = samples[idx]; + } else { + samples[idx] = LastRowResult(samples_ext); + } + StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages)); + } + src += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + int idx = 0; + for (int x = 0; x < block_width; x += 8) { + StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages)); + } + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_width_log2, int block_height_log2> +void CflSubsampler444_WxH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_assert(block_width_log2 == 4 || block_width_log2 == 5, + "This function will only work for block_width 16 and 32."); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + + const int block_height = 1 << block_height_log2; + const int vert_inside = block_height <= max_luma_height; + if (vert_inside) { + CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, true>( + luma, max_luma_width, max_luma_height, source, stride); + } else { + CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, false>( + luma, max_luma_width, max_luma_height, source, stride); + } +} + +template <int block_height_log2> +void CflSubsampler420_4xH_SSE4_1( + int16_t 
luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int /*max_luma_width*/, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + const __m128i zero = _mm_setzero_si128(); + __m128i final_sum = zero; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = luma_height; + + do { + const __m128i samples_row0 = LoadUnaligned16(src); + src += src_stride; + const __m128i samples_row1 = LoadUnaligned16(src); + src += src_stride; + const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1); + + const __m128i samples_row2 = LoadUnaligned16(src); + src += src_stride; + const __m128i samples_row3 = LoadUnaligned16(src); + src += src_stride; + const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3); + __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr); + luma_ptr += kCflLumaBufferStride << 1; + + const __m128i samples_row4 = LoadUnaligned16(src); + src += src_stride; + const __m128i samples_row5 = LoadUnaligned16(src); + src += src_stride; + const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5); + + const __m128i samples_row6 = LoadUnaligned16(src); + src += src_stride; + const __m128i samples_row7 = LoadUnaligned16(src); + src += src_stride; + const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7); + sum = _mm_add_epi16( + sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr)); + luma_ptr += kCflLumaBufferStride << 1; + + final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); + final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); + y -= 4; + } while (y != 0); + + const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride); + const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill); + for (y = luma_height; y < 
block_height; ++y) { + StoreLo8(luma_ptr, final_fill); + luma_ptr += kCflLumaBufferStride; + final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); + } + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8)); + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4)); + + __m128i averages = RightShiftWithRounding_U32( + final_sum, block_height_log2 + 2 /*log2 of width 4*/); + + averages = _mm_shufflelo_epi16(averages, 0); + luma_ptr = luma[0]; + y = block_height; + do { + const __m128i samples = LoadLo8(luma_ptr); + StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template <int block_height_log2, int max_luma_width> +inline void CflSubsampler420Impl_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + const __m128i zero = _mm_setzero_si128(); + __m128i final_sum = zero; + int16_t* luma_ptr = luma[0]; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = luma_height; + + do { + const __m128i samples_row00 = LoadUnaligned16(src); + const __m128i samples_row01 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row00); + src += src_stride; + const __m128i samples_row10 = LoadUnaligned16(src); + const __m128i samples_row11 = (max_luma_width == 16) + ? 
LoadUnaligned16(src + 8) + : LastRowSamples(samples_row10); + src += src_stride; + const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10); + const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11); + __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr); + luma_ptr += kCflLumaBufferStride; + + const __m128i samples_row20 = LoadUnaligned16(src); + const __m128i samples_row21 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row20); + src += src_stride; + const __m128i samples_row30 = LoadUnaligned16(src); + const __m128i samples_row31 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row30); + src += src_stride; + const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30); + const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31); + sum = _mm_add_epi16( + sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const __m128i samples_row40 = LoadUnaligned16(src); + const __m128i samples_row41 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row40); + src += src_stride; + const __m128i samples_row50 = LoadUnaligned16(src); + const __m128i samples_row51 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row50); + src += src_stride; + const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50); + const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51); + sum = _mm_add_epi16( + sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const __m128i samples_row60 = LoadUnaligned16(src); + const __m128i samples_row61 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row60); + src += src_stride; + const __m128i samples_row70 = LoadUnaligned16(src); + const __m128i samples_row71 = (max_luma_width == 16) + ? 
LoadUnaligned16(src + 8) + : LastRowSamples(samples_row70); + src += src_stride; + const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70); + const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71); + sum = _mm_add_epi16( + sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); + final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); + y -= 4; + } while (y != 0); + + // Duplicate the final row downward to the end after max_luma_height. + const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride); + const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill); + const __m128i final_fill_to_sum1 = + _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8)); + const __m128i final_fill_to_sum = + _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1); + for (y = luma_height; y < block_height; ++y) { + StoreUnaligned16(luma_ptr, final_fill); + luma_ptr += kCflLumaBufferStride; + final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); + } + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8)); + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4)); + + __m128i averages = RightShiftWithRounding_S32( + final_sum, block_height_log2 + 3 /*log2 of width 8*/); + + averages = _mm_shufflelo_epi16(averages, 0); + averages = _mm_shuffle_epi32(averages, 0); + luma_ptr = luma[0]; + y = block_height; + do { + const __m128i samples = LoadUnaligned16(luma_ptr); + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template <int block_height_log2> +void CflSubsampler420_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + if (max_luma_width == 8) { + 
    CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(luma, max_luma_height,
                                                          source, stride);
  } else {
    CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>(
        luma, max_luma_height, source, stride);
  }
}

// 4:2:0 luma subsampler for 16- and 32-wide blocks. Fills |luma| with the
// downsampled, mean-removed luma values: each entry is a 2x2 luma sum doubled
// (see StoreLumaResults8_420). |max_luma_width| is a template parameter so
// the per-8-sample loads below fold at compile time into either real loads
// or LastRowSamples() duplication of the last visible pair.
template <int block_width_log2, int block_height_log2, int max_luma_width>
inline void CflSubsampler420Impl_WxH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int max_luma_height, const void* const source, ptrdiff_t stride) {
  const auto* src = static_cast<const uint16_t*>(source);
  const ptrdiff_t src_stride = stride / sizeof(src[0]);
  const __m128i zero = _mm_setzero_si128();
  __m128i final_sum = zero;
  const int block_height = 1 << block_height_log2;
  // 4:2:0 vertical subsampling: two source rows make one output row.
  const int luma_height = std::min(block_height, max_luma_height >> 1);
  int16_t* luma_ptr = luma[0];
  __m128i final_row_result;
  // Begin first y section, covering width up to 32.
  int y = luma_height;

  do {
    const uint16_t* src_next = src + src_stride;
    const __m128i samples_row00 = LoadUnaligned16(src);
    const __m128i samples_row01 = (max_luma_width >= 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row00);
    const __m128i samples_row02 = (max_luma_width >= 24)
                                      ? LoadUnaligned16(src + 16)
                                      : LastRowSamples(samples_row01);
    const __m128i samples_row03 = (max_luma_width == 32)
                                      ? LoadUnaligned16(src + 24)
                                      : LastRowSamples(samples_row02);
    const __m128i samples_row10 = LoadUnaligned16(src_next);
    const __m128i samples_row11 = (max_luma_width >= 16)
                                      ? LoadUnaligned16(src_next + 8)
                                      : LastRowSamples(samples_row10);
    const __m128i samples_row12 = (max_luma_width >= 24)
                                      ? LoadUnaligned16(src_next + 16)
                                      : LastRowSamples(samples_row11);
    const __m128i samples_row13 = (max_luma_width == 32)
                                      ? LoadUnaligned16(src_next + 24)
                                      : LastRowSamples(samples_row12);
    // Vertical pair sums; the horizontal (2x1) add and <<1 happen inside
    // StoreLumaResults8_420.
    const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10);
    const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11);
    const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12);
    const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13);
    __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
    final_row_result =
        StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
    sum = _mm_add_epi16(sum, final_row_result);
    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));

    // Because max_luma_width is at most 32, any values beyond x=16 will
    // necessarily be duplicated.
    if (block_width_log2 == 5) {
      const __m128i wide_fill = LastRowResult(final_row_result);
      // There are 16 16-bit fill values per row, shifting by 2 accounts for
      // the widening to 32-bit.
      final_sum = _mm_add_epi32(
          final_sum, _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), 2));
    }
    src += src_stride << 1;
    luma_ptr += kCflLumaBufferStride;
  } while (--y != 0);

  // Begin second y section: rows past the visible luma repeat the last
  // written output row.
  y = luma_height;
  if (y < block_height) {
    const __m128i final_fill0 =
        LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
    const __m128i final_fill1 =
        LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
    // Set and read only when block_width_log2 == 5.
    __m128i wide_fill;
    if (block_width_log2 == 5) {
      // There are 16 16-bit fill values per row, shifting by 2 accounts for
      // the widening to 32-bit.
      wide_fill =
          _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2);
    }
    // 32-bit sum of the 16 replicated entries, added once per filled row.
    const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
    const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
    const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
    const __m128i final_fill_to_sum =
        _mm_add_epi32(final_inner_sum0, final_inner_sum1);

    do {
      StoreUnaligned16(luma_ptr, final_fill0);
      StoreUnaligned16(luma_ptr + 8, final_fill1);
      if (block_width_log2 == 5) {
        final_sum = _mm_add_epi32(final_sum, wide_fill);
      }
      luma_ptr += kCflLumaBufferStride;
      final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
    } while (++y < block_height);
  }  // End second y section.

  // Reduce the four 32-bit partial sums into lane 0.
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));

  __m128i averages = RightShiftWithRounding_S32(
      final_sum, block_width_log2 + block_height_log2);
  // Broadcast the rounded average to all 16-bit lanes.
  averages = _mm_shufflelo_epi16(averages, 0);
  averages = _mm_shuffle_epi32(averages, 0);

  // Final pass: subtract the mean from every stored value.
  luma_ptr = luma[0];
  y = block_height;
  do {
    const __m128i samples0 = LoadUnaligned16(luma_ptr);
    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages));
    const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
    final_row_result = _mm_sub_epi16(samples1, averages);
    StoreUnaligned16(luma_ptr + 8, final_row_result);

    // For 32-wide blocks, columns 16..31 duplicate the last computed value.
    if (block_width_log2 == 5) {
      const __m128i wide_fill = LastRowResult(final_row_result);
      StoreUnaligned16(luma_ptr + 16, wide_fill);
      StoreUnaligned16(luma_ptr + 24, wide_fill);
    }
    luma_ptr += kCflLumaBufferStride;
  } while (--y != 0);
}

// Runtime dispatch on |max_luma_width| to a compile-time specialization of
// CflSubsampler420Impl_WxH_SSE4_1.
template <int block_width_log2, int block_height_log2>
void CflSubsampler420_WxH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int max_luma_width, const int max_luma_height,
    const void* const source, ptrdiff_t stride) {
  switch (max_luma_width) {
    case 8:
      CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
          luma, max_luma_height, source, stride);
      return;
    case 16:
      CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>(
          luma, max_luma_height, source, stride);
      return;
    case 24:
      CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>(
          luma, max_luma_height, source, stride);
      return;
    default:
      assert(max_luma_width == 32);
      CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>(
          luma, max_luma_height, source, stride);
      return;
  }
}

// Registers the 10bpp SSE4.1 CfL predictors and subsamplers in the dsp table.
// Every entry is individually guarded by DSP_ENABLED_10BPP_SSE4_1 so a build
// can compile out any of them.
void Init10bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
  assert(dsp != nullptr);

  // CfL intra predictors, one per supported transform size.
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize4x4] =
      CflIntraPredictor_10bpp_SSE4_1<4, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize4x8] =
      CflIntraPredictor_10bpp_SSE4_1<4, 8>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize4x16] =
      CflIntraPredictor_10bpp_SSE4_1<4, 16>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x4] =
      CflIntraPredictor_10bpp_SSE4_1<8, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x8] =
      CflIntraPredictor_10bpp_SSE4_1<8, 8>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x16] =
      CflIntraPredictor_10bpp_SSE4_1<8, 16>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x32] =
      CflIntraPredictor_10bpp_SSE4_1<8, 32>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x4] =
      CflIntraPredictor_10bpp_SSE4_1<16, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x8] =
      CflIntraPredictor_10bpp_SSE4_1<16, 8>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x16] =
      CflIntraPredictor_10bpp_SSE4_1<16, 16>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x32] =
      CflIntraPredictor_10bpp_SSE4_1<16, 32>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize32x8] =
      CflIntraPredictor_10bpp_SSE4_1<32, 8>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize32x16] =
      CflIntraPredictor_10bpp_SSE4_1<32, 16>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize32x32] =
      CflIntraPredictor_10bpp_SSE4_1<32, 32>;
#endif
  // 4:2:0 subsamplers. Template arguments are log2 of the block dimensions.
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
      CflSubsampler420_4xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
      CflSubsampler420_4xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
      CflSubsampler420_4xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<5>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 2>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 5>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<5, 3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<5, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<5, 5>;
#endif

  // 4:4:4 subsamplers (no spatial downsampling).
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
      CflSubsampler444_4xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
      CflSubsampler444_4xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
      CflSubsampler444_4xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<5>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<4, 2>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<4, 3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<4, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<4, 5>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<5, 3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<5, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<5, 5>;
#endif
}

}  // namespace
}  // namespace high_bitdepth
#endif  // LIBGAV1_MAX_BITDEPTH >= 10

// Public entry point: installs the SSE4.1 CfL functions for each bitdepth
// compiled into this build.
void IntraPredCflInit_SSE4_1() {
  low_bitdepth::Init8bpp();
#if LIBGAV1_MAX_BITDEPTH >= 10
  high_bitdepth::Init10bpp();
#endif  // LIBGAV1_MAX_BITDEPTH >= 10
}

}  // namespace dsp
}  // namespace libgav1