diff options
Diffstat (limited to 'src/dsp/x86/obmc_sse4.cc')
-rw-r--r-- | src/dsp/x86/obmc_sse4.cc | 287 |
1 files changed, 285 insertions, 2 deletions
diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc index 3a1d1fd..c34a7f7 100644 --- a/src/dsp/x86/obmc_sse4.cc +++ b/src/dsp/x86/obmc_sse4.cc @@ -31,6 +31,7 @@ namespace libgav1 { namespace dsp { +namespace low_bitdepth { namespace { #include "src/dsp/obmc.inc" @@ -311,13 +312,295 @@ void Init8bpp() { } } // namespace +} // namespace low_bitdepth -void ObmcInit_SSE4_1() { Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +#include "src/dsp/obmc.inc" + +constexpr int kRoundBitsObmcBlend = 6; + +inline void OverlapBlendFromLeft2xH_SSE4_1( + uint16_t* const prediction, const ptrdiff_t pred_stride, const int height, + const uint16_t* const obmc_prediction, const ptrdiff_t obmc_pred_stride) { + uint16_t* pred = prediction; + const uint16_t* obmc_pred = obmc_prediction; + const ptrdiff_t pred_stride2 = pred_stride << 1; + const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1; + const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040); + const __m128i mask_val = _mm_shufflelo_epi16(Load2(kObmcMask), 0x00); + // 64 - mask. + const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks = + _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val)); + int y = height; + do { + const __m128i pred_val = Load4x2(pred, pred + pred_stride); + const __m128i obmc_pred_val = + Load4x2(obmc_pred, obmc_pred + obmc_pred_stride); + const __m128i terms = _mm_unpacklo_epi16(pred_val, obmc_pred_val); + const __m128i result = RightShiftWithRounding_U32( + _mm_madd_epi16(terms, masks), kRoundBitsObmcBlend); + const __m128i packed_result = _mm_packus_epi32(result, result); + Store4(pred, packed_result); + Store4(pred + pred_stride, _mm_srli_si128(packed_result, 4)); + pred += pred_stride2; + obmc_pred += obmc_pred_stride2; + y -= 2; + } while (y != 0); +} + +inline void OverlapBlendFromLeft4xH_SSE4_1( + uint16_t* const prediction, const ptrdiff_t pred_stride, const int height, + const uint16_t* const obmc_prediction, const ptrdiff_t obmc_pred_stride) { + uint16_t* pred = prediction; + const uint16_t* obmc_pred = obmc_prediction; + const ptrdiff_t pred_stride2 = pred_stride << 1; + const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1; + const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040); + const __m128i mask_val = Load4(kObmcMask + 2); + // 64 - mask. + const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks = + _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val)); + int y = height; + do { + const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride); + const __m128i obmc_pred_val = + LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride); + const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val); + const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val); + const __m128i result_lo = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_lo, masks), kRoundBitsObmcBlend); + const __m128i result_hi = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_hi, masks), kRoundBitsObmcBlend); + const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi); + StoreLo8(pred, packed_result); + StoreHi8(pred + pred_stride, packed_result); + pred += pred_stride2; + obmc_pred += obmc_pred_stride2; + y -= 2; + } while (y != 0); +} + +void OverlapBlendFromLeft10bpp_SSE4_1(void* const prediction, + const ptrdiff_t prediction_stride, + const int width, const int height, + const void* const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + auto* pred = static_cast<uint16_t*>(prediction); + const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction); + const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]); + const ptrdiff_t obmc_pred_stride = + obmc_prediction_stride / sizeof(obmc_pred[0]); + + if (width == 2) { + OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred, + obmc_pred_stride); + return; + } + if (width == 4) { + OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred, + obmc_pred_stride); + return; + } + const __m128i mask_inverter = _mm_set1_epi8(64); + const uint8_t* mask = kObmcMask + width - 2; + int x = 0; + do { + pred = static_cast<uint16_t*>(prediction) + x; + obmc_pred = static_cast<const uint16_t*>(obmc_prediction) + x; + const __m128i mask_val = LoadLo8(mask + x); + // 64 - mask + const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); + const __m128i masks_lo = _mm_cvtepi8_epi16(masks); + const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8)); + int y = height; + do { + const __m128i pred_val = LoadUnaligned16(pred); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); + const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val); + const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val); + const __m128i result_lo = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend); + const __m128i result_hi = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend); + StoreUnaligned16(pred, _mm_packus_epi32(result_lo, result_hi)); + + pred += pred_stride; + obmc_pred += obmc_pred_stride; + } while (--y != 0); + x += 8; + } while (x < width); +} + +inline void OverlapBlendFromTop2xH_SSE4_1(uint16_t* const prediction, + const ptrdiff_t pred_stride, + const int height, + const uint16_t* const obmc_prediction, + const ptrdiff_t obmc_pred_stride) { + uint16_t* pred = prediction; + const uint16_t* obmc_pred = obmc_prediction; + const __m128i mask_inverter = _mm_set1_epi16(64); + const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0); + const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1); + const uint8_t* mask = kObmcMask + height - 2; + const int compute_height = + height - (height >> 2); // compute_height based on 8-bit opt + const ptrdiff_t pred_stride2 = pred_stride << 1; + const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1; + int y = 0; + do { + // First mask in the first half, second mask in the second half. + const __m128i mask_val = _mm_shuffle_epi8(Load4(mask + y), mask_shuffler); + const __m128i masks = + _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter)); + const __m128i masks_lo = _mm_cvtepi8_epi16(masks); + const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8)); + + const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride); + const __m128i obmc_pred_val = + LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride); + const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val); + const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val); + const __m128i result_lo = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend); + const __m128i result_hi = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend); + const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi); + + Store4(pred, packed_result); + Store4(pred + pred_stride, _mm_srli_si128(packed_result, 8)); + pred += pred_stride2; + obmc_pred += obmc_pred_stride2; + y += 2; + } while (y < compute_height); +} + +inline void OverlapBlendFromTop4xH_SSE4_1(uint16_t* const prediction, + const ptrdiff_t pred_stride, + const int height, + const uint16_t* const obmc_prediction, + const ptrdiff_t obmc_pred_stride) { + uint16_t* pred = prediction; + const uint16_t* obmc_pred = obmc_prediction; + const __m128i mask_inverter = _mm_set1_epi16(64); + const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0); + const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1); + const uint8_t* mask = kObmcMask + height - 2; + const int compute_height = height - (height >> 2); + const ptrdiff_t pred_stride2 = pred_stride << 1; + const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1; + int y = 0; + do { + // First mask in the first half, second mask in the second half. + const __m128i mask_val = _mm_shuffle_epi8(Load4(mask + y), mask_shuffler); + const __m128i masks = + _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter)); + const __m128i masks_lo = _mm_cvtepi8_epi16(masks); + const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8)); + + const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride); + const __m128i obmc_pred_val = + LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride); + const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val); + const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val); + const __m128i result_lo = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend); + const __m128i result_hi = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend); + const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi); + + StoreLo8(pred, packed_result); + StoreHi8(pred + pred_stride, packed_result); + pred += pred_stride2; + obmc_pred += obmc_pred_stride2; + y += 2; + } while (y < compute_height); +} + +void OverlapBlendFromTop10bpp_SSE4_1(void* const prediction, + const ptrdiff_t prediction_stride, + const int width, const int height, + const void* const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + auto* pred = static_cast<uint16_t*>(prediction); + const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction); + const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]); + const ptrdiff_t obmc_pred_stride = + obmc_prediction_stride / sizeof(obmc_pred[0]); + + if (width == 2) { + OverlapBlendFromTop2xH_SSE4_1(pred, pred_stride, height, obmc_pred, + obmc_pred_stride); + return; + } + if (width == 4) { + OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred, + obmc_pred_stride); + return; + } + + const __m128i mask_inverter = _mm_set1_epi8(64); + const int compute_height = height - (height >> 2); + const uint8_t* mask = kObmcMask + height - 2; + pred = static_cast<uint16_t*>(prediction); + obmc_pred = static_cast<const uint16_t*>(obmc_prediction); + int y = 0; + do { + const __m128i mask_val = _mm_set1_epi8(mask[y]); + // 64 - mask + const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); + const __m128i masks_lo = _mm_cvtepi8_epi16(masks); + const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8)); + int x = 0; + do { + const __m128i pred_val = LoadUnaligned16(pred + x); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x); + const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val); + const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val); + const __m128i result_lo = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend); + const __m128i result_hi = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend); + StoreUnaligned16(pred + x, _mm_packus_epi32(result_lo, result_hi)); + x += 8; + } while (x < width); + pred += pred_stride; + obmc_pred += obmc_pred_stride; + } while (++y < compute_height); +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); +#if DSP_ENABLED_10BPP_SSE4_1(ObmcVertical) + dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop10bpp_SSE4_1; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(ObmcHorizontal) + dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft10bpp_SSE4_1; +#endif +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void ObmcInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { |