diff options
Diffstat (limited to 'src/dsp/x86/obmc_sse4.cc')
-rw-r--r-- | src/dsp/x86/obmc_sse4.cc | 144 |
1 files changed, 75 insertions, 69 deletions
diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc index 8ce23b4..f068ff3 100644 --- a/src/dsp/x86/obmc_sse4.cc +++ b/src/dsp/x86/obmc_sse4.cc @@ -39,8 +39,8 @@ namespace { inline void OverlapBlendFromLeft2xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 2; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040); @@ -51,8 +51,7 @@ inline void OverlapBlendFromLeft2xH_SSE4_1( int y = height; do { const __m128i pred_val = Load2x2(pred, pred + prediction_stride); - const __m128i obmc_pred_val = - Load2x2(obmc_pred, obmc_pred + obmc_prediction_stride); + const __m128i obmc_pred_val = Load4(obmc_pred); const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); const __m128i result = @@ -71,8 +70,8 @@ inline void OverlapBlendFromLeft2xH_SSE4_1( inline void OverlapBlendFromLeft4xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 4; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040); @@ -85,15 +84,12 @@ inline void OverlapBlendFromLeft4xH_SSE4_1( int y = height; do { const __m128i pred_val0 = Load4(pred); - const __m128i obmc_pred_val0 = Load4(obmc_pred); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; // Place the second row of each source in the second four bytes. const __m128i pred_val = _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12); - const __m128i obmc_pred_val = _mm_alignr_epi8( - Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12); + const __m128i obmc_pred_val = LoadLo8(obmc_pred); const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); const __m128i result = RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); @@ -102,7 +98,7 @@ inline void OverlapBlendFromLeft4xH_SSE4_1( const int second_row_result = _mm_extract_epi32(packed_result, 1); memcpy(pred, &second_row_result, sizeof(second_row_result)); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + obmc_pred += obmc_prediction_stride << 1; y -= 2; } while (y != 0); } @@ -110,8 +106,8 @@ inline void OverlapBlendFromLeft4xH_SSE4_1( inline void OverlapBlendFromLeft8xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 8; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_set1_epi8(64); @@ -121,16 +117,25 @@ inline void OverlapBlendFromLeft8xH_SSE4_1( const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); int y = height; do { - const __m128i pred_val = LoadLo8(pred); - const __m128i obmc_pred_val = LoadLo8(obmc_pred); - const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); - const __m128i result = - RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); + const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); + + const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val); + const __m128i result_lo = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6); + + const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val); + const __m128i result_hi = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6); - StoreLo8(pred, _mm_packus_epi16(result, result)); + const __m128i result = _mm_packus_epi16(result_lo, result_hi); + StoreLo8(pred, result); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - } while (--y != 0); + StoreHi8(pred, result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride << 1; + y -= 2; + } while (y != 0); } void OverlapBlendFromLeft_SSE4_1( @@ -144,18 +149,15 @@ void OverlapBlendFromLeft_SSE4_1( assert(height >= 4); if (width == 2) { - OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } if (width == 4) { - OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } if (width == 8) { - OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } const __m128i mask_inverter = _mm_set1_epi8(64); @@ -192,8 +194,8 @@ void OverlapBlendFromLeft_SSE4_1( inline void OverlapBlendFromTop4xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 4; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_set1_epi16(64); @@ -212,13 +214,10 @@ inline void OverlapBlendFromTop4xH_SSE4_1( _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter)); const __m128i pred_val0 = Load4(pred); - const __m128i obmc_pred_val0 = Load4(obmc_pred); + const __m128i obmc_pred_val = LoadLo8(obmc_pred); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; const __m128i pred_val = _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12); - const __m128i obmc_pred_val = _mm_alignr_epi8( - Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12); const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val); const __m128i result = RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); @@ -227,7 +226,7 @@ inline void OverlapBlendFromTop4xH_SSE4_1( Store4(pred - prediction_stride, packed_result); Store4(pred, _mm_srli_si128(packed_result, 4)); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + obmc_pred += obmc_prediction_stride << 1; y += 2; } while (y < compute_height); } @@ -235,8 +234,8 @@ inline void OverlapBlendFromTop4xH_SSE4_1( inline void OverlapBlendFromTop8xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 8; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const uint8_t* mask = kObmcMask + height - 2; @@ -244,20 +243,35 @@ inline void OverlapBlendFromTop8xH_SSE4_1( const int compute_height = height - (height >> 2); int y = compute_height; do { - const __m128i mask_val = _mm_set1_epi8(mask[compute_height - y]); + const __m128i mask_val0 = _mm_set1_epi8(mask[compute_height - y]); // 64 - mask - const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); - const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); - const __m128i pred_val = LoadLo8(pred); - const __m128i obmc_pred_val = LoadLo8(obmc_pred); - const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); - const __m128i result = - RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); + const __m128i obmc_mask_val0 = _mm_sub_epi8(mask_inverter, mask_val0); + const __m128i masks0 = _mm_unpacklo_epi8(mask_val0, obmc_mask_val0); - StoreLo8(pred, _mm_packus_epi16(result, result)); + const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); + + const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val); + const __m128i result_lo = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks0), 6); + + --y; + const __m128i mask_val1 = _mm_set1_epi8(mask[compute_height - y]); + // 64 - mask + const __m128i obmc_mask_val1 = _mm_sub_epi8(mask_inverter, mask_val1); + const __m128i masks1 = _mm_unpacklo_epi8(mask_val1, obmc_mask_val1); + + const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val); + const __m128i result_hi = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks1), 6); + + const __m128i result = _mm_packus_epi16(result_lo, result_hi); + StoreLo8(pred, result); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - } while (--y != 0); + StoreHi8(pred, result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride << 1; + } while (--y > 0); } void OverlapBlendFromTop_SSE4_1( @@ -271,13 +285,11 @@ void OverlapBlendFromTop_SSE4_1( assert(height >= 2); if (width == 4) { - OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } if (width == 8) { - OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } @@ -333,8 +345,8 @@ constexpr int kRoundBitsObmcBlend = 6; inline void OverlapBlendFromLeft2xH_SSE4_1( uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride, - const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_pred_stride) { + const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_pred_stride = 2; uint16_t* pred = prediction; const uint16_t* obmc_pred = obmc_prediction; const ptrdiff_t pred_stride2 = pred_stride << 1; @@ -348,8 +360,7 @@ inline void OverlapBlendFromLeft2xH_SSE4_1( int y = height; do { const __m128i pred_val = Load4x2(pred, pred + pred_stride); - const __m128i obmc_pred_val = - Load4x2(obmc_pred, obmc_pred + obmc_pred_stride); + const __m128i obmc_pred_val = LoadLo8(obmc_pred); const __m128i terms = _mm_unpacklo_epi16(pred_val, obmc_pred_val); const __m128i result = RightShiftWithRounding_U32( _mm_madd_epi16(terms, masks), kRoundBitsObmcBlend); @@ -364,8 +375,8 @@ inline void OverlapBlendFromLeft2xH_SSE4_1( inline void OverlapBlendFromLeft4xH_SSE4_1( uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride, - const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_pred_stride) { + const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_pred_stride = 4; uint16_t* pred = prediction; const uint16_t* obmc_pred = obmc_prediction; const ptrdiff_t pred_stride2 = pred_stride << 1; @@ -379,8 +390,7 @@ inline void OverlapBlendFromLeft4xH_SSE4_1( int y = height; do { const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride); - const __m128i obmc_pred_val = - LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val); const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val); const __m128i result_lo = RightShiftWithRounding_U32( @@ -410,13 +420,11 @@ void OverlapBlendFromLeft10bpp_SSE4_1( assert(height >= 4); if (width == 2) { - OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred, - obmc_pred_stride); + OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred); return; } if (width == 4) { - OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred, - obmc_pred_stride); + OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred); return; } const __m128i mask_inverter = _mm_set1_epi8(64); @@ -452,8 +460,8 @@ void OverlapBlendFromLeft10bpp_SSE4_1( inline void OverlapBlendFromTop4xH_SSE4_1( uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride, - const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_pred_stride) { + const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_pred_stride = 4; uint16_t* pred = prediction; const uint16_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_set1_epi16(64); @@ -473,8 +481,7 @@ inline void OverlapBlendFromTop4xH_SSE4_1( const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8)); const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride); - const __m128i obmc_pred_val = - LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val); const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val); const __m128i result_lo = RightShiftWithRounding_U32( @@ -505,8 +512,7 @@ void OverlapBlendFromTop10bpp_SSE4_1( assert(height >= 2); if (width == 4) { - OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred, - obmc_pred_stride); + OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred); return; } |