diff options
Diffstat (limited to 'src/dsp/x86/mask_blend_sse4.cc')
-rw-r--r-- | src/dsp/x86/mask_blend_sse4.cc | 159 |
1 file changed, 84 insertions, 75 deletions
diff --git a/src/dsp/x86/mask_blend_sse4.cc b/src/dsp/x86/mask_blend_sse4.cc index 2e836af..a18444b 100644 --- a/src/dsp/x86/mask_blend_sse4.cc +++ b/src/dsp/x86/mask_blend_sse4.cc @@ -36,7 +36,8 @@ namespace { // Width can only be 4 when it is subsampled from a block of width 8, hence // subsampling_x is always 1 when this function is called. template <int subsampling_x, int subsampling_y> -inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) { +inline __m128i GetMask4x2(const uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { if (subsampling_x == 1) { const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask)); const __m128i mask_val_1 = @@ -62,7 +63,8 @@ inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) { // 16-bit is also the lowest packing for hadd, but without subsampling there is // an unfortunate conversion required. template <int subsampling_x, int subsampling_y> -inline __m128i GetMask8(const uint8_t* mask, ptrdiff_t stride) { +inline __m128i GetMask8(const uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t stride) { if (subsampling_x == 1) { const __m128i row_vals = LoadUnaligned16(mask); @@ -89,7 +91,8 @@ inline __m128i GetMask8(const uint8_t* mask, ptrdiff_t stride) { // when is_inter_intra is true, the prediction values are brought to 8-bit // packing as well. 
template <int subsampling_x, int subsampling_y> -inline __m128i GetInterIntraMask8(const uint8_t* mask, ptrdiff_t stride) { +inline __m128i GetInterIntraMask8(const uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t stride) { if (subsampling_x == 1) { const __m128i row_vals = LoadUnaligned16(mask); @@ -116,10 +119,11 @@ inline __m128i GetInterIntraMask8(const uint8_t* mask, ptrdiff_t stride) { return mask_val; } -inline void WriteMaskBlendLine4x2(const int16_t* const pred_0, - const int16_t* const pred_1, +inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0, + const int16_t* LIBGAV1_RESTRICT const pred_1, const __m128i pred_mask_0, - const __m128i pred_mask_1, uint8_t* dst, + const __m128i pred_mask_1, + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const __m128i pred_val_0 = LoadAligned16(pred_0); const __m128i pred_val_1 = LoadAligned16(pred_1); @@ -145,9 +149,11 @@ inline void WriteMaskBlendLine4x2(const int16_t* const pred_0, } template <int subsampling_x, int subsampling_y> -inline void MaskBlending4x4_SSE4(const int16_t* pred_0, const int16_t* pred_1, - const uint8_t* mask, - const ptrdiff_t mask_stride, uint8_t* dst, +inline void MaskBlending4x4_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, + const uint8_t* LIBGAV1_RESTRICT mask, + const ptrdiff_t mask_stride, + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const __m128i mask_inverter = _mm_set1_epi16(64); __m128i pred_mask_0 = @@ -167,10 +173,12 @@ inline void MaskBlending4x4_SSE4(const int16_t* pred_0, const int16_t* pred_1, } template <int subsampling_x, int subsampling_y> -inline void MaskBlending4xH_SSE4(const int16_t* pred_0, const int16_t* pred_1, - const uint8_t* const mask_ptr, +inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, const int height, - uint8_t* dst, const 
ptrdiff_t dst_stride) { + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { const uint8_t* mask = mask_ptr; if (height == 4) { MaskBlending4x4_SSE4<subsampling_x, subsampling_y>( @@ -222,11 +230,12 @@ inline void MaskBlending4xH_SSE4(const int16_t* pred_0, const int16_t* pred_1, } template <int subsampling_x, int subsampling_y> -inline void MaskBlend_SSE4(const void* prediction_0, const void* prediction_1, +inline void MaskBlend_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t /*prediction_stride_1*/, - const uint8_t* const mask_ptr, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, const int width, - const int height, void* dest, + const int height, void* LIBGAV1_RESTRICT dest, const ptrdiff_t dst_stride) { auto* dst = static_cast<uint8_t*>(dest); const auto* pred_0 = static_cast<const int16_t*>(prediction_0); @@ -277,11 +286,10 @@ inline void MaskBlend_SSE4(const void* prediction_0, const void* prediction_1, } while (++y < height); } -inline void InterIntraWriteMaskBlendLine8bpp4x2(const uint8_t* const pred_0, - uint8_t* const pred_1, - const ptrdiff_t pred_stride_1, - const __m128i pred_mask_0, - const __m128i pred_mask_1) { +inline void InterIntraWriteMaskBlendLine8bpp4x2( + const uint8_t* LIBGAV1_RESTRICT const pred_0, + uint8_t* LIBGAV1_RESTRICT const pred_1, const ptrdiff_t pred_stride_1, + const __m128i pred_mask_0, const __m128i pred_mask_1) { const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1); const __m128i pred_val_0 = LoadLo8(pred_0); @@ -301,11 +309,10 @@ inline void InterIntraWriteMaskBlendLine8bpp4x2(const uint8_t* const pred_0, } template <int subsampling_x, int subsampling_y> -inline void InterIntraMaskBlending8bpp4x4_SSE4(const uint8_t* pred_0, - uint8_t* pred_1, - const ptrdiff_t pred_stride_1, - const uint8_t* mask, - const ptrdiff_t mask_stride) { +inline void InterIntraMaskBlending8bpp4x4_SSE4( + const uint8_t* 
LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1, + const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask, + const ptrdiff_t mask_stride) { const __m128i mask_inverter = _mm_set1_epi8(64); const __m128i pred_mask_u16_first = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); @@ -328,12 +335,11 @@ inline void InterIntraMaskBlending8bpp4x4_SSE4(const uint8_t* pred_0, } template <int subsampling_x, int subsampling_y> -inline void InterIntraMaskBlending8bpp4xH_SSE4(const uint8_t* pred_0, - uint8_t* pred_1, - const ptrdiff_t pred_stride_1, - const uint8_t* const mask_ptr, - const ptrdiff_t mask_stride, - const int height) { +inline void InterIntraMaskBlending8bpp4xH_SSE4( + const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1, + const ptrdiff_t pred_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const int height) { const uint8_t* mask = mask_ptr; if (height == 4) { InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>( @@ -358,12 +364,11 @@ inline void InterIntraMaskBlending8bpp4xH_SSE4(const uint8_t* pred_0, } template <int subsampling_x, int subsampling_y> -void InterIntraMaskBlend8bpp_SSE4(const uint8_t* prediction_0, - uint8_t* prediction_1, - const ptrdiff_t prediction_stride_1, - const uint8_t* const mask_ptr, - const ptrdiff_t mask_stride, const int width, - const int height) { +void InterIntraMaskBlend8bpp_SSE4( + const uint8_t* LIBGAV1_RESTRICT prediction_0, + uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const int width, const int height) { if (width == 4) { InterIntraMaskBlending8bpp4xH_SSE4<subsampling_x, subsampling_y>( prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride, @@ -503,10 +508,11 @@ inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride, } inline void WriteMaskBlendLine10bpp4x2_SSE4_1( - const 
uint16_t* pred_0, const uint16_t* pred_1, - const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0, - const __m128i& pred_mask_1, const __m128i& offset, const __m128i& max, - const __m128i& shift4, uint16_t* dst, const ptrdiff_t dst_stride) { + const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, + const __m128i& pred_mask_0, const __m128i& pred_mask_1, + const __m128i& offset, const __m128i& max, const __m128i& shift4, + uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const __m128i pred_val_0 = LoadUnaligned16(pred_0); const __m128i pred_val_1 = LoadHi8(LoadLo8(pred_1), pred_1 + pred_stride_1); @@ -544,11 +550,12 @@ inline void WriteMaskBlendLine10bpp4x2_SSE4_1( } template <int subsampling_x, int subsampling_y> -inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* pred_0, - const uint16_t* pred_1, +inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, - const uint8_t* mask, - const ptrdiff_t mask_stride, uint16_t* dst, + const uint8_t* LIBGAV1_RESTRICT mask, + const ptrdiff_t mask_stride, + uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); const __m128i zero = _mm_setzero_si128(); @@ -575,13 +582,12 @@ inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* pred_0, } template <int subsampling_x, int subsampling_y> -inline void MaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0, - const uint16_t* pred_1, - const ptrdiff_t pred_stride_1, - const uint8_t* const mask_ptr, - const ptrdiff_t mask_stride, - const int height, uint16_t* dst, - const ptrdiff_t dst_stride) { +inline void MaskBlend10bpp4xH_SSE4_1( + const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const int height, uint16_t* 
LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { const uint8_t* mask = mask_ptr; if (height == 4) { MaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>( @@ -648,13 +654,13 @@ inline void MaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0, } template <int subsampling_x, int subsampling_y> -inline void MaskBlend10bpp_SSE4_1(const void* prediction_0, - const void* prediction_1, - const ptrdiff_t prediction_stride_1, - const uint8_t* const mask_ptr, - const ptrdiff_t mask_stride, const int width, - const int height, void* dest, - const ptrdiff_t dest_stride) { +inline void MaskBlend10bpp_SSE4_1( + const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + const ptrdiff_t prediction_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const int width, const int height, void* LIBGAV1_RESTRICT dest, + const ptrdiff_t dest_stride) { auto* dst = static_cast<uint16_t*>(dest); const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]); const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); @@ -725,10 +731,11 @@ inline void MaskBlend10bpp_SSE4_1(const void* prediction_0, } inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1( - const uint16_t* prediction_0, const uint16_t* prediction_1, + const uint16_t* LIBGAV1_RESTRICT prediction_0, + const uint16_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0, - const __m128i& pred_mask_1, const __m128i& shift6, uint16_t* dst, - const ptrdiff_t dst_stride) { + const __m128i& pred_mask_1, const __m128i& shift6, + uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const __m128i pred_val_0 = LoadUnaligned16(prediction_0); const __m128i pred_val_1 = LoadHi8(LoadLo8(prediction_1), prediction_1 + pred_stride_1); @@ -751,9 +758,10 @@ inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1( template <int subsampling_x, int subsampling_y> inline void InterIntraMaskBlend10bpp4x4_SSE4_1( - const uint16_t* 
pred_0, const uint16_t* pred_1, - const ptrdiff_t pred_stride_1, const uint8_t* mask, - const ptrdiff_t mask_stride, uint16_t* dst, const ptrdiff_t dst_stride) { + const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, + const uint8_t* LIBGAV1_RESTRICT mask, const ptrdiff_t mask_stride, + uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); const __m128i zero = _mm_setzero_si128(); @@ -777,13 +785,12 @@ inline void InterIntraMaskBlend10bpp4x4_SSE4_1( } template <int subsampling_x, int subsampling_y> -inline void InterIntraMaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0, - const uint16_t* pred_1, - const ptrdiff_t pred_stride_1, - const uint8_t* const mask_ptr, - const ptrdiff_t mask_stride, - const int height, uint16_t* dst, - const ptrdiff_t dst_stride) { +inline void InterIntraMaskBlend10bpp4xH_SSE4_1( + const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const int height, uint16_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { const uint8_t* mask = mask_ptr; if (height == 4) { InterIntraMaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>( @@ -848,9 +855,11 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0, template <int subsampling_x, int subsampling_y> inline void InterIntraMaskBlend10bpp_SSE4_1( - const void* prediction_0, const void* prediction_1, - const ptrdiff_t prediction_stride_1, const uint8_t* const mask_ptr, - const ptrdiff_t mask_stride, const int width, const int height, void* dest, + const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + const ptrdiff_t prediction_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const 
int width, const int height, void* LIBGAV1_RESTRICT dest, const ptrdiff_t dest_stride) { auto* dst = static_cast<uint16_t*>(dest); const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]); |