Diffstat (limited to 'src/dsp/x86/mask_blend_sse4.cc')
-rw-r--r--  src/dsp/x86/mask_blend_sse4.cc  507
1 file changed, 505 insertions, 2 deletions
diff --git a/src/dsp/x86/mask_blend_sse4.cc b/src/dsp/x86/mask_blend_sse4.cc
index d8036be..2e836af 100644
--- a/src/dsp/x86/mask_blend_sse4.cc
+++ b/src/dsp/x86/mask_blend_sse4.cc
@@ -430,12 +430,515 @@ void Init8bpp() {
 }  // namespace
 }  // namespace low_bitdepth
 
-void MaskBlendInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kMax10bppSample = (1 << 10) - 1;
+constexpr int kMaskInverse = 64;
+constexpr int kRoundBitsMaskBlend = 4;
+
+inline __m128i RightShiftWithRoundingZero_U16(const __m128i v_val_d, int bits,
+                                              const __m128i zero) {
+  // Shift out all but the last bit.
+  const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
+  // Avg with zero will shift by 1 and round.
+  return _mm_avg_epu16(v_tmp_d, zero);
+}
+
+inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits,
+                                               const __m128i shift) {
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, shift);
+  return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride,
+                          const __m128i zero) {
+  if (subsampling_x == 1) {
+    if (subsampling_y == 0) {
+      const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
+      const __m128i mask_val_1 =
+          _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
+      __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+      return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero);
+    }
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i mask_val_0 =
+        LoadHi8(LoadLo8(mask), mask + (mask_stride << 1));
+    const __m128i mask_val_1 = LoadHi8(LoadLo8(mask + mask_stride),
+                                       mask + (mask_stride << 1) + mask_stride);
+    const __m128i add = _mm_adds_epu8(mask_val_0, mask_val_1);
+    const __m128i subsampled_mask = _mm_maddubs_epi16(add, one);
+    return RightShiftWithRoundingZero_U16(subsampled_mask, 2, zero);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const __m128i mask_val_0 = Load4(mask);
+  const __m128i mask_val_1 = Load4(mask + mask_stride);
+  return _mm_cvtepu8_epi16(
+      _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
+}
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride,
+                        const __m128i zero) {
+  if (subsampling_x == 1) {
+    if (subsampling_y == 0) {
+      const __m128i row_vals = LoadUnaligned16(mask);
+      const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+      const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+      __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+      return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero);
+    }
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i mask_val_0 = LoadUnaligned16(mask);
+    const __m128i mask_val_1 = LoadUnaligned16(mask + stride);
+    const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1);
+    const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
+    return RightShiftWithRoundingZero_U16(mask_0, 2, zero);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const __m128i mask_val = LoadLo8(mask);
+  return _mm_cvtepu8_epi16(mask_val);
+}
+
+inline void WriteMaskBlendLine10bpp4x2_SSE4_1(
+    const uint16_t* pred_0, const uint16_t* pred_1,
+    const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0,
+    const __m128i& pred_mask_1, const __m128i& offset, const __m128i& max,
+    const __m128i& shift4, uint16_t* dst, const ptrdiff_t dst_stride) {
+  const __m128i pred_val_0 = LoadUnaligned16(pred_0);
+  const __m128i pred_val_1 = LoadHi8(LoadLo8(pred_1), pred_1 + pred_stride_1);
+
+  // int res = (mask_value * pred_0[x] + (64 - mask_value) * pred_1[x]) >> 6;
+  const __m128i compound_pred_lo_0 = _mm_mullo_epi16(pred_val_0, pred_mask_0);
+  const __m128i compound_pred_hi_0 = _mm_mulhi_epu16(pred_val_0, pred_mask_0);
+  const __m128i compound_pred_lo_1 = _mm_mullo_epi16(pred_val_1, pred_mask_1);
+  const __m128i compound_pred_hi_1 = _mm_mulhi_epu16(pred_val_1, pred_mask_1);
+  const __m128i pack0_lo =
+      _mm_unpacklo_epi16(compound_pred_lo_0, compound_pred_hi_0);
+  const __m128i pack0_hi =
+      _mm_unpackhi_epi16(compound_pred_lo_0, compound_pred_hi_0);
+  const __m128i pack1_lo =
+      _mm_unpacklo_epi16(compound_pred_lo_1, compound_pred_hi_1);
+  const __m128i pack1_hi =
+      _mm_unpackhi_epi16(compound_pred_lo_1, compound_pred_hi_1);
+  const __m128i compound_pred_lo = _mm_add_epi32(pack0_lo, pack1_lo);
+  const __m128i compound_pred_hi = _mm_add_epi32(pack0_hi, pack1_hi);
+  // res -= (bitdepth == 8) ? 0 : kCompoundOffset;
+  const __m128i sub_0 =
+      _mm_sub_epi32(_mm_srli_epi32(compound_pred_lo, 6), offset);
+  const __m128i sub_1 =
+      _mm_sub_epi32(_mm_srli_epi32(compound_pred_hi, 6), offset);
+
+  // dst[x] = static_cast<Pixel>(
+  //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+  //           (1 << kBitdepth8) - 1));
+  const __m128i shift_0 =
+      RightShiftWithRoundingConst_S32(sub_0, kRoundBitsMaskBlend, shift4);
+  const __m128i shift_1 =
+      RightShiftWithRoundingConst_S32(sub_1, kRoundBitsMaskBlend, shift4);
+  const __m128i result = _mm_min_epi16(_mm_packus_epi32(shift_0, shift_1), max);
+  StoreLo8(dst, result);
+  StoreHi8(dst + dst_stride, result);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* pred_0,
+                                     const uint16_t* pred_1,
+                                     const ptrdiff_t pred_stride_1,
+                                     const uint8_t* mask,
+                                     const ptrdiff_t mask_stride, uint16_t* dst,
+                                     const ptrdiff_t dst_stride) {
+  const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
+  const __m128i offset = _mm_set1_epi32(kCompoundOffset);
+  const __m128i max = _mm_set1_epi16(kMax10bppSample);
+  __m128i pred_mask_0 =
+      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+  __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+  WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
+                                    pred_mask_1, offset, max, shift4, dst,
+                                    dst_stride);
+  pred_0 += 4 << 1;
+  pred_1 += pred_stride_1 << 1;
+  mask += mask_stride << (1 + subsampling_y);
+  dst += dst_stride << 1;
+
+  pred_mask_0 =
+      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+  pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+  WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
+                                    pred_mask_1, offset, max, shift4, dst,
+                                    dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0,
+                                     const uint16_t* pred_1,
+                                     const ptrdiff_t pred_stride_1,
+                                     const uint8_t* const mask_ptr,
+                                     const ptrdiff_t mask_stride,
+                                     const int height, uint16_t* dst,
+                                     const ptrdiff_t dst_stride) {
+  const uint8_t* mask = mask_ptr;
+  if (height == 4) {
+    MaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride);
+    return;
+  }
+  const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+  const __m128i zero = _mm_setzero_si128();
+  const uint8_t pred0_stride2 = 4 << 1;
+  const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
+  const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y);
+  const ptrdiff_t dst_stride2 = dst_stride << 1;
+  const __m128i offset = _mm_set1_epi32(kCompoundOffset);
+  const __m128i max = _mm_set1_epi16(kMax10bppSample);
+  const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
+  int y = height;
+  do {
+    __m128i pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+    WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1, offset, max,
+                                      shift4, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+
+    pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1, offset, max,
+                                      shift4, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+
+    pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1, offset, max,
+                                      shift4, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+
+    pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1, offset, max,
+                                      shift4, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+    y -= 8;
+  } while (y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend10bpp_SSE4_1(const void* prediction_0,
+                                  const void* prediction_1,
+                                  const ptrdiff_t prediction_stride_1,
+                                  const uint8_t* const mask_ptr,
+                                  const ptrdiff_t mask_stride, const int width,
+                                  const int height, void* dest,
+                                  const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  const ptrdiff_t pred_stride_0 = width;
+  const ptrdiff_t pred_stride_1 = prediction_stride_1;
+  if (width == 4) {
+    MaskBlend10bpp4xH_SSE4_1<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask_ptr, mask_stride, height, dst,
+        dst_stride);
+    return;
+  }
+  const uint8_t* mask = mask_ptr;
+  const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+  const __m128i zero = _mm_setzero_si128();
+  const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
+  const __m128i offset = _mm_set1_epi32(kCompoundOffset);
+  const __m128i max = _mm_set1_epi16(kMax10bppSample);
+  const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
+  int y = height;
+  do {
+    int x = 0;
+    do {
+      const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+          mask + (x << subsampling_x), mask_stride, zero);
+      const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
+      const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
+      // 64 - mask
+      const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+      const __m128i compound_pred_lo_0 =
+          _mm_mullo_epi16(pred_val_0, pred_mask_0);
+      const __m128i compound_pred_hi_0 =
+          _mm_mulhi_epu16(pred_val_0, pred_mask_0);
+      const __m128i compound_pred_lo_1 =
+          _mm_mullo_epi16(pred_val_1, pred_mask_1);
+      const __m128i compound_pred_hi_1 =
+          _mm_mulhi_epu16(pred_val_1, pred_mask_1);
+      const __m128i pack0_lo =
+          _mm_unpacklo_epi16(compound_pred_lo_0, compound_pred_hi_0);
+      const __m128i pack0_hi =
+          _mm_unpackhi_epi16(compound_pred_lo_0, compound_pred_hi_0);
+      const __m128i pack1_lo =
+          _mm_unpacklo_epi16(compound_pred_lo_1, compound_pred_hi_1);
+      const __m128i pack1_hi =
+          _mm_unpackhi_epi16(compound_pred_lo_1, compound_pred_hi_1);
+      const __m128i compound_pred_lo = _mm_add_epi32(pack0_lo, pack1_lo);
+      const __m128i compound_pred_hi = _mm_add_epi32(pack0_hi, pack1_hi);
+
+      const __m128i sub_0 =
+          _mm_sub_epi32(_mm_srli_epi32(compound_pred_lo, 6), offset);
+      const __m128i sub_1 =
+          _mm_sub_epi32(_mm_srli_epi32(compound_pred_hi, 6), offset);
+      const __m128i shift_0 =
+          RightShiftWithRoundingConst_S32(sub_0, kRoundBitsMaskBlend, shift4);
+      const __m128i shift_1 =
+          RightShiftWithRoundingConst_S32(sub_1, kRoundBitsMaskBlend, shift4);
+      const __m128i result =
+          _mm_min_epi16(_mm_packus_epi32(shift_0, shift_1), max);
+      StoreUnaligned16(dst + x, result);
+      x += 8;
+    } while (x < width);
+    dst += dst_stride;
+    pred_0 += pred_stride_0;
+    pred_1 += pred_stride_1;
+    mask += mask_stride_ss;
+  } while (--y != 0);
+}
+
+inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(
+    const uint16_t* prediction_0, const uint16_t* prediction_1,
+    const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0,
+    const __m128i& pred_mask_1, const __m128i& shift6, uint16_t* dst,
+    const ptrdiff_t dst_stride) {
+  const __m128i pred_val_0 = LoadUnaligned16(prediction_0);
+  const __m128i pred_val_1 =
+      LoadHi8(LoadLo8(prediction_1), prediction_1 + pred_stride_1);
+
+  const __m128i mask_0 = _mm_unpacklo_epi16(pred_mask_1, pred_mask_0);
+  const __m128i mask_1 = _mm_unpackhi_epi16(pred_mask_1, pred_mask_0);
+  const __m128i pred_0 = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+  const __m128i pred_1 = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+  const __m128i compound_pred_0 = _mm_madd_epi16(pred_0, mask_0);
+  const __m128i compound_pred_1 = _mm_madd_epi16(pred_1, mask_1);
+  const __m128i shift_0 =
+      RightShiftWithRoundingConst_S32(compound_pred_0, 6, shift6);
+  const __m128i shift_1 =
+      RightShiftWithRoundingConst_S32(compound_pred_1, 6, shift6);
+  const __m128i res = _mm_packus_epi32(shift_0, shift_1);
+  StoreLo8(dst, res);
+  StoreHi8(dst + dst_stride, res);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend10bpp4x4_SSE4_1(
+    const uint16_t* pred_0, const uint16_t* pred_1,
+    const ptrdiff_t pred_stride_1, const uint8_t* mask,
+    const ptrdiff_t mask_stride, uint16_t* dst, const ptrdiff_t dst_stride) {
+  const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+  const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i pred_mask_0 =
+      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+  __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+  InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                              pred_mask_0, pred_mask_1, shift6,
+                                              dst, dst_stride);
+  pred_0 += 4 << 1;
+  pred_1 += pred_stride_1 << 1;
+  mask += mask_stride << (1 + subsampling_y);
+  dst += dst_stride << 1;
+
+  pred_mask_0 =
+      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+  pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+  InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                              pred_mask_0, pred_mask_1, shift6,
+                                              dst, dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0,
+                                               const uint16_t* pred_1,
+                                               const ptrdiff_t pred_stride_1,
+                                               const uint8_t* const mask_ptr,
+                                               const ptrdiff_t mask_stride,
+                                               const int height, uint16_t* dst,
+                                               const ptrdiff_t dst_stride) {
+  const uint8_t* mask = mask_ptr;
+  if (height == 4) {
+    InterIntraMaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride);
+    return;
+  }
+  const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
+  const uint8_t pred0_stride2 = 4 << 1;
+  const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
+  const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y);
+  const ptrdiff_t dst_stride2 = dst_stride << 1;
+  int y = height;
+  do {
+    __m128i pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                                pred_mask_0, pred_mask_1,
+                                                shift6, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+
+    pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                                pred_mask_0, pred_mask_1,
+                                                shift6, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+
+    pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                                pred_mask_0, pred_mask_1,
+                                                shift6, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+
+    pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                                pred_mask_0, pred_mask_1,
+                                                shift6, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+    y -= 8;
+  } while (y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend10bpp_SSE4_1(
+    const void* prediction_0, const void* prediction_1,
+    const ptrdiff_t prediction_stride_1, const uint8_t* const mask_ptr,
+    const ptrdiff_t mask_stride, const int width, const int height, void* dest,
+    const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  const ptrdiff_t pred_stride_0 = width;
+  const ptrdiff_t pred_stride_1 = prediction_stride_1;
+  if (width == 4) {
+    InterIntraMaskBlend10bpp4xH_SSE4_1<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask_ptr, mask_stride, height, dst,
+        dst_stride);
+    return;
+  }
+  const uint8_t* mask = mask_ptr;
+  const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+  const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
+  const __m128i zero = _mm_setzero_si128();
+  const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
+  int y = height;
+  do {
+    int x = 0;
+    do {
+      const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+          mask + (x << subsampling_x), mask_stride, zero);
+      const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
+      const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
+      // 64 - mask
+      const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+      const __m128i mask_0 = _mm_unpacklo_epi16(pred_mask_1, pred_mask_0);
+      const __m128i mask_1 = _mm_unpackhi_epi16(pred_mask_1, pred_mask_0);
+      const __m128i pred_0 = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+      const __m128i pred_1 = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+      const __m128i compound_pred_0 = _mm_madd_epi16(pred_0, mask_0);
+      const __m128i compound_pred_1 = _mm_madd_epi16(pred_1, mask_1);
+      const __m128i shift_0 =
+          RightShiftWithRoundingConst_S32(compound_pred_0, 6, shift6);
+      const __m128i shift_1 =
+          RightShiftWithRoundingConst_S32(compound_pred_1, 6, shift6);
+      StoreUnaligned16(dst + x, _mm_packus_epi32(shift_0, shift_1));
+      x += 8;
+    } while (x < width);
+    dst += dst_stride;
+    pred_0 += pred_stride_0;
+    pred_1 += pred_stride_1;
+    mask += mask_stride_ss;
+  } while (--y != 0);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend444)
+  dsp->mask_blend[0][0] = MaskBlend10bpp_SSE4_1<0, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend422)
+  dsp->mask_blend[1][0] = MaskBlend10bpp_SSE4_1<1, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend420)
+  dsp->mask_blend[2][0] = MaskBlend10bpp_SSE4_1<1, 1>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra444)
+  dsp->mask_blend[0][1] = InterIntraMaskBlend10bpp_SSE4_1<0, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra422)
+  dsp->mask_blend[1][1] = InterIntraMaskBlend10bpp_SSE4_1<1, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra420)
+  dsp->mask_blend[2][1] = InterIntraMaskBlend10bpp_SSE4_1<1, 1>;
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void MaskBlendInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
 
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_TARGETING_SSE4_1
+#else   // !LIBGAV1_TARGETING_SSE4_1
 
 namespace libgav1 {
 namespace dsp {
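For reference, the in-code comments above spell out the per-pixel arithmetic the new 10bpp kernels vectorize: a 6-bit weighted blend of the two predictions, a compound-offset subtraction, rounding by `kRoundBitsMaskBlend`, and a clip to the 10-bit range. Below is a minimal scalar sketch of that arithmetic, assuming only the constants quoted in the patch; the helper names and the `compound_offset` parameter are illustrative and not part of the change (libgav1's actual `kCompoundOffset` value is defined elsewhere in the library).

```cpp
#include <algorithm>
#include <cstdint>

// Hypothetical scalar reference for one pixel of the compound path
// (dsp->mask_blend[...][0]). compound_offset stands in for kCompoundOffset.
inline uint16_t MaskBlendPixel10bpp(uint16_t pred_0, uint16_t pred_1,
                                    uint8_t mask_value, int compound_offset) {
  constexpr int kRoundBitsMaskBlend = 4;          // As in the patch.
  constexpr int kMax10bppSample = (1 << 10) - 1;  // As in the patch.
  // res = (mask_value * pred_0 + (64 - mask_value) * pred_1) >> 6;
  int res = (mask_value * pred_0 + (64 - mask_value) * pred_1) >> 6;
  // res -= kCompoundOffset (only for bitdepth > 8).
  res -= compound_offset;
  // RightShiftWithRounding, then clip to the 10-bit pixel range.
  res = (res + (1 << (kRoundBitsMaskBlend - 1))) >> kRoundBitsMaskBlend;
  return static_cast<uint16_t>(std::min(std::max(res, 0), kMax10bppSample));
}

// The inter-intra variant (dsp->mask_blend[...][1]) skips the offset and
// rounds the 6-bit blend directly; the madd in the patch pairs pred_0 with
// (64 - mask) and pred_1 with mask.
inline uint16_t InterIntraBlendPixel10bpp(uint16_t pred_0, uint16_t pred_1,
                                          uint8_t mask_value) {
  const int res = (64 - mask_value) * pred_0 + mask_value * pred_1;
  return static_cast<uint16_t>((res + (1 << 5)) >> 6);  // (1 << 6) >> 1 = 32.
}
```

The SSE4.1 code computes the same values eight (or two rows of four) pixels at a time, using `_mm_mullo_epi16`/`_mm_mulhi_epu16` plus 32-bit adds for the compound path and a single `_mm_madd_epi16` for the inter-intra path.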