diff options
Diffstat (limited to 'src/dsp/x86/obmc_sse4.cc')
-rw-r--r-- | src/dsp/x86/obmc_sse4.cc | 329 |
1 files changed, 329 insertions, 0 deletions
diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc new file mode 100644 index 0000000..3a1d1fd --- /dev/null +++ b/src/dsp/x86/obmc_sse4.cc @@ -0,0 +1,329 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/obmc.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 + +#include <xmmintrin.h> + +#include <cassert> +#include <cstddef> +#include <cstdint> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace { + +#include "src/dsp/obmc.inc" + +inline void OverlapBlendFromLeft2xH_SSE4_1( + uint8_t* const prediction, const ptrdiff_t prediction_stride, + const int height, const uint8_t* const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + uint8_t* pred = prediction; + const uint8_t* obmc_pred = obmc_prediction; + const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040); + const __m128i mask_val = _mm_shufflelo_epi16(Load4(kObmcMask), 0); + // 64 - mask + const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); + int y = height; + do { + const __m128i pred_val = Load2x2(pred, pred + prediction_stride); + const __m128i obmc_pred_val = + Load2x2(obmc_pred, obmc_pred + obmc_prediction_stride); + + const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); + const __m128i result = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); + const __m128i packed_result = _mm_packus_epi16(result, result); + Store2(pred, packed_result); + pred += prediction_stride; + const int16_t second_row_result = _mm_extract_epi16(packed_result, 1); + memcpy(pred, &second_row_result, sizeof(second_row_result)); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride << 1; + y -= 2; + } while (y != 0); +} + +inline void OverlapBlendFromLeft4xH_SSE4_1( + uint8_t* const prediction, const ptrdiff_t prediction_stride, + const int height, const uint8_t* const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + uint8_t* pred = prediction; + const uint8_t* obmc_pred = obmc_prediction; + const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040); + const __m128i mask_val = Load4(kObmcMask + 2); + // 64 - mask + const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + // Duplicate first half of vector. + const __m128i masks = + _mm_shuffle_epi32(_mm_unpacklo_epi8(mask_val, obmc_mask_val), 0x44); + int y = height; + do { + const __m128i pred_val0 = Load4(pred); + const __m128i obmc_pred_val0 = Load4(obmc_pred); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + // Place the second row of each source in the second four bytes. + const __m128i pred_val = + _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12); + const __m128i obmc_pred_val = _mm_alignr_epi8( + Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12); + const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); + const __m128i result = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); + const __m128i packed_result = _mm_packus_epi16(result, result); + Store4(pred - prediction_stride, packed_result); + const int second_row_result = _mm_extract_epi32(packed_result, 1); + memcpy(pred, &second_row_result, sizeof(second_row_result)); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + y -= 2; + } while (y != 0); +} + +inline void OverlapBlendFromLeft8xH_SSE4_1( + uint8_t* const prediction, const ptrdiff_t prediction_stride, + const int height, const uint8_t* const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + uint8_t* pred = prediction; + const uint8_t* obmc_pred = obmc_prediction; + const __m128i mask_inverter = _mm_set1_epi8(64); + const __m128i mask_val = LoadLo8(kObmcMask + 6); + // 64 - mask + const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); + int y = height; + do { + const __m128i pred_val = LoadLo8(pred); + const __m128i obmc_pred_val = LoadLo8(obmc_pred); + const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); + const __m128i result = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); + + StoreLo8(pred, _mm_packus_epi16(result, result)); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + } while (--y != 0); +} + +void OverlapBlendFromLeft_SSE4_1(void* const prediction, + const ptrdiff_t prediction_stride, + const int width, const int height, + const void* const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + auto* pred = static_cast<uint8_t*>(prediction); + const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction); + + if (width == 2) { + OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred, + obmc_prediction_stride); + return; + } + if (width == 4) { + OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred, + obmc_prediction_stride); + return; + } + if (width == 8) { + OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred, + obmc_prediction_stride); + return; + } + const __m128i mask_inverter = _mm_set1_epi8(64); + const uint8_t* mask = kObmcMask + width - 2; + int x = 0; + do { + pred = static_cast<uint8_t*>(prediction) + x; + obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x; + const __m128i mask_val = LoadUnaligned16(mask + x); + // 64 - mask + const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks_lo = _mm_unpacklo_epi8(mask_val, obmc_mask_val); + const __m128i masks_hi = _mm_unpackhi_epi8(mask_val, obmc_mask_val); + + int y = 0; + do { + const __m128i pred_val = LoadUnaligned16(pred); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); + const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val); + const __m128i result_lo = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks_lo), 6); + const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val); + const __m128i result_hi = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks_hi), 6); + StoreUnaligned16(pred, _mm_packus_epi16(result_lo, result_hi)); + + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + } while (++y < height); + x += 16; + } while (x < width); +} + +inline void OverlapBlendFromTop4xH_SSE4_1( + uint8_t* const prediction, const ptrdiff_t prediction_stride, + const int height, const uint8_t* const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + uint8_t* pred = prediction; + const uint8_t* obmc_pred = obmc_prediction; + const __m128i mask_inverter = _mm_set1_epi16(64); + const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0); + const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1); + + const uint8_t* mask = kObmcMask + height - 2; + const int compute_height = height - (height >> 2); + int y = 0; + do { + // First mask in the first half, second mask in the second half. + const __m128i mask_val = _mm_shuffle_epi8( + _mm_cvtsi32_si128(*reinterpret_cast<const uint16_t*>(mask + y)), + mask_shuffler); + const __m128i masks = + _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter)); + const __m128i pred_val0 = Load4(pred); + + const __m128i obmc_pred_val0 = Load4(obmc_pred); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + const __m128i pred_val = + _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12); + const __m128i obmc_pred_val = _mm_alignr_epi8( + Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12); + const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val); + const __m128i result = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); + + const __m128i packed_result = _mm_packus_epi16(result, result); + Store4(pred - prediction_stride, packed_result); + Store4(pred, _mm_srli_si128(packed_result, 4)); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + y += 2; + } while (y < compute_height); +} + +inline void OverlapBlendFromTop8xH_SSE4_1( + uint8_t* const prediction, const ptrdiff_t prediction_stride, + const int height, const uint8_t* const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + uint8_t* pred = prediction; + const uint8_t* obmc_pred = obmc_prediction; + const uint8_t* mask = kObmcMask + height - 2; + const __m128i mask_inverter = _mm_set1_epi8(64); + const int compute_height = height - (height >> 2); + int y = compute_height; + do { + const __m128i mask_val = _mm_set1_epi8(mask[compute_height - y]); + // 64 - mask + const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); + const __m128i pred_val = LoadLo8(pred); + const __m128i obmc_pred_val = LoadLo8(obmc_pred); + const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); + const __m128i result = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); + + StoreLo8(pred, _mm_packus_epi16(result, result)); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + } while (--y != 0); +} + +void OverlapBlendFromTop_SSE4_1(void* const prediction, + const ptrdiff_t prediction_stride, + const int width, const int height, + const void* const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + auto* pred = static_cast<uint8_t*>(prediction); + const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction); + + if (width <= 4) { + OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred, + obmc_prediction_stride); + return; + } + if (width == 8) { + OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred, + obmc_prediction_stride); + return; + } + + // Stop when mask value becomes 64. + const int compute_height = height - (height >> 2); + const __m128i mask_inverter = _mm_set1_epi8(64); + int y = 0; + const uint8_t* mask = kObmcMask + height - 2; + do { + const __m128i mask_val = _mm_set1_epi8(mask[y]); + // 64 - mask + const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); + int x = 0; + do { + const __m128i pred_val = LoadUnaligned16(pred + x); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x); + const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val); + const __m128i result_lo = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6); + const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val); + const __m128i result_hi = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6); + StoreUnaligned16(pred + x, _mm_packus_epi16(result_lo, result_hi)); + x += 16; + } while (x < width); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + } while (++y < compute_height); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); +#if DSP_ENABLED_8BPP_SSE4_1(ObmcVertical) + dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(ObmcHorizontal) + dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_SSE4_1; +#endif +} + +} // namespace + +void ObmcInit_SSE4_1() { Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 + +namespace libgav1 { +namespace dsp { + +void ObmcInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 |