diff options
Diffstat (limited to 'src/dsp/x86/weight_mask_sse4.cc')
-rw-r--r-- | src/dsp/x86/weight_mask_sse4.cc | 633 |
1 files changed, 562 insertions, 71 deletions
diff --git a/src/dsp/x86/weight_mask_sse4.cc b/src/dsp/x86/weight_mask_sse4.cc index dfd5662..08a1739 100644 --- a/src/dsp/x86/weight_mask_sse4.cc +++ b/src/dsp/x86/weight_mask_sse4.cc @@ -36,47 +36,65 @@ namespace { constexpr int kRoundingBits8bpp = 4; -template <bool mask_is_inverse> -inline void WeightMask8_SSE4(const int16_t* prediction_0, - const int16_t* prediction_1, uint8_t* mask) { - const __m128i pred_0 = LoadAligned16(prediction_0); - const __m128i pred_1 = LoadAligned16(prediction_1); - const __m128i difference = RightShiftWithRounding_U16( - _mm_abs_epi16(_mm_sub_epi16(pred_0, pred_1)), kRoundingBits8bpp); - const __m128i scaled_difference = _mm_srli_epi16(difference, 4); +template <bool mask_is_inverse, bool is_store_16> +inline void WeightMask16_SSE4(const int16_t* prediction_0, + const int16_t* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const __m128i pred_00 = LoadAligned16(prediction_0); + const __m128i pred_10 = LoadAligned16(prediction_1); + const __m128i difference_0 = RightShiftWithRounding_U16( + _mm_abs_epi16(_mm_sub_epi16(pred_00, pred_10)), kRoundingBits8bpp); + const __m128i scaled_difference_0 = _mm_srli_epi16(difference_0, 4); + + const __m128i pred_01 = LoadAligned16(prediction_0 + 8); + const __m128i pred_11 = LoadAligned16(prediction_1 + 8); + const __m128i difference_1 = RightShiftWithRounding_U16( + _mm_abs_epi16(_mm_sub_epi16(pred_01, pred_11)), kRoundingBits8bpp); + const __m128i scaled_difference_1 = _mm_srli_epi16(difference_1, 4); + const __m128i difference_offset = _mm_set1_epi8(38); const __m128i adjusted_difference = - _mm_adds_epu8(_mm_packus_epi16(scaled_difference, scaled_difference), + _mm_adds_epu8(_mm_packus_epi16(scaled_difference_0, scaled_difference_1), difference_offset); const __m128i mask_ceiling = _mm_set1_epi8(64); const __m128i mask_value = _mm_min_epi8(adjusted_difference, mask_ceiling); if (mask_is_inverse) { const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value); - StoreLo8(mask, inverted_mask_value); + if (is_store_16) { + StoreAligned16(mask, inverted_mask_value); + } else { + StoreLo8(mask, inverted_mask_value); + StoreHi8(mask + mask_stride, inverted_mask_value); + } } else { - StoreLo8(mask, mask_value); + if (is_store_16) { + StoreAligned16(mask, mask_value); + } else { + StoreLo8(mask, mask_value); + StoreHi8(mask + mask_stride, mask_value); + } } } -#define WEIGHT8_WITHOUT_STRIDE \ - WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask) +#define WEIGHT8_PAIR_WITHOUT_STRIDE \ + WeightMask16_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride) -#define WEIGHT8_AND_STRIDE \ - WEIGHT8_WITHOUT_STRIDE; \ - pred_0 += 8; \ - pred_1 += 8; \ - mask += mask_stride +#define WEIGHT8_PAIR_AND_STRIDE \ + WEIGHT8_PAIR_WITHOUT_STRIDE; \ + pred_0 += 8 << 1; \ + pred_1 += 8 << 1; \ + mask += mask_stride << 1 template <bool mask_is_inverse> void WeightMask8x8_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y = 0; - do { - WEIGHT8_AND_STRIDE; - } while (++y < 7); - WEIGHT8_WITHOUT_STRIDE; + + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_WITHOUT_STRIDE; } template <bool mask_is_inverse> @@ -84,13 +102,13 @@ void WeightMask8x16_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y3 = 0; + int y3 = 3; do { - WEIGHT8_AND_STRIDE; - WEIGHT8_AND_STRIDE; - WEIGHT8_AND_STRIDE; - } while (++y3 < 5); - WEIGHT8_WITHOUT_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + } while (--y3 != 0); + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_WITHOUT_STRIDE; } template <bool mask_is_inverse> @@ -98,21 +116,17 @@ void WeightMask8x32_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y5 = 0; + int y5 = 5; do { - WEIGHT8_AND_STRIDE; - WEIGHT8_AND_STRIDE; - WEIGHT8_AND_STRIDE; - WEIGHT8_AND_STRIDE; - WEIGHT8_AND_STRIDE; - } while (++y5 < 6); - WEIGHT8_AND_STRIDE; - WEIGHT8_WITHOUT_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + } while (--y5 != 0); + WEIGHT8_PAIR_WITHOUT_STRIDE; } -#define WEIGHT16_WITHOUT_STRIDE \ - WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8) +#define WEIGHT16_WITHOUT_STRIDE \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride) #define WEIGHT16_AND_STRIDE \ WEIGHT16_WITHOUT_STRIDE; \ @@ -125,10 +139,10 @@ void WeightMask16x8_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y = 0; + int y = 7; do { WEIGHT16_AND_STRIDE; - } while (++y < 7); + } while (--y != 0); WEIGHT16_WITHOUT_STRIDE; } @@ -137,12 +151,12 @@ void WeightMask16x16_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y3 = 0; + int y3 = 5; do { WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; - } while (++y3 < 5); + } while (--y3 != 0); WEIGHT16_WITHOUT_STRIDE; } @@ -151,14 +165,14 @@ void WeightMask16x32_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y5 = 0; + int y5 = 6; do { WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; - } while (++y5 < 6); + } while (--y5 != 0); WEIGHT16_AND_STRIDE; WEIGHT16_WITHOUT_STRIDE; } @@ -168,20 +182,19 @@ void WeightMask16x64_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y3 = 0; + int y3 = 21; do { WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; - } while (++y3 < 21); + } while (--y3 != 0); WEIGHT16_WITHOUT_STRIDE; } -#define WEIGHT32_WITHOUT_STRIDE \ - WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24) +#define WEIGHT32_WITHOUT_STRIDE \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride) #define WEIGHT32_AND_STRIDE \ WEIGHT32_WITHOUT_STRIDE; \ @@ -209,12 +222,12 @@ void WeightMask32x16_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y3 = 0; + int y3 = 5; do { WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; - } while (++y3 < 5); + } while (--y3 != 0); WEIGHT32_WITHOUT_STRIDE; } @@ -223,14 +236,14 @@ void WeightMask32x32_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y5 = 0; + int y5 = 6; do { WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; - } while (++y5 < 6); + } while (--y5 != 0); WEIGHT32_AND_STRIDE; WEIGHT32_WITHOUT_STRIDE; } @@ -240,24 +253,23 @@ void WeightMask32x64_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y3 = 0; + int y3 = 21; do { WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; - } while (++y3 < 21); + } while (--y3 != 0); WEIGHT32_WITHOUT_STRIDE; } -#define WEIGHT64_WITHOUT_STRIDE \ - WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 32, pred_1 + 32, mask + 32); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 40, pred_1 + 40, mask + 40); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 48, pred_1 + 48, mask + 48); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 56, pred_1 + 56, mask + 56) +#define WEIGHT64_WITHOUT_STRIDE \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride); \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \ + mask + 32, mask_stride); \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \ + mask + 48, mask_stride) #define WEIGHT64_AND_STRIDE \ WEIGHT64_WITHOUT_STRIDE; \ @@ -447,12 +459,491 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void WeightMaskInit_SSE4_1() { low_bitdepth::Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +constexpr int kRoundingBits10bpp = 6; +constexpr int kScaledDiffShift = 4; + +template <bool mask_is_inverse, bool is_store_16> +inline void WeightMask16_10bpp_SSE4(const uint16_t* prediction_0, + const uint16_t* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const __m128i diff_offset = _mm_set1_epi8(38); + const __m128i mask_ceiling = _mm_set1_epi8(64); + const __m128i zero = _mm_setzero_si128(); + + // Range of prediction: [3988, 61532]. + const __m128i pred_00 = LoadAligned16(prediction_0); + const __m128i pred_10 = LoadAligned16(prediction_1); + const __m128i pred_lo_00 = _mm_cvtepu16_epi32(pred_00); + const __m128i pred_lo_10 = _mm_cvtepu16_epi32(pred_10); + const __m128i diff_lo_0 = RightShiftWithRounding_U32( + _mm_abs_epi32(_mm_sub_epi32(pred_lo_00, pred_lo_10)), kRoundingBits10bpp); + + const __m128i pred_hi_00 = _mm_unpackhi_epi16(pred_00, zero); + const __m128i pred_hi_10 = _mm_unpackhi_epi16(pred_10, zero); + const __m128i diff_hi_0 = RightShiftWithRounding_U32( + _mm_abs_epi32(_mm_sub_epi32(pred_hi_00, pred_hi_10)), kRoundingBits10bpp); + + const __m128i diff_0 = _mm_packus_epi32(diff_lo_0, diff_hi_0); + const __m128i scaled_diff_0 = _mm_srli_epi16(diff_0, kScaledDiffShift); + + const __m128i pred_01 = LoadAligned16(prediction_0 + 8); + const __m128i pred_11 = LoadAligned16(prediction_1 + 8); + const __m128i pred_lo_01 = _mm_cvtepu16_epi32(pred_01); + const __m128i pred_lo_11 = _mm_cvtepu16_epi32(pred_11); + const __m128i diff_lo_1 = RightShiftWithRounding_U32( + _mm_abs_epi32(_mm_sub_epi32(pred_lo_01, pred_lo_11)), kRoundingBits10bpp); + + const __m128i pred_hi_01 = _mm_unpackhi_epi16(pred_01, zero); + const __m128i pred_hi_11 = _mm_unpackhi_epi16(pred_11, zero); + const __m128i diff_hi_1 = RightShiftWithRounding_U32( + _mm_abs_epi32(_mm_sub_epi32(pred_hi_01, pred_hi_11)), kRoundingBits10bpp); + + const __m128i diff_1 = _mm_packus_epi32(diff_lo_1, diff_hi_1); + const __m128i scaled_diff_1 = _mm_srli_epi16(diff_1, kScaledDiffShift); + + const __m128i adjusted_diff = _mm_adds_epu8( + _mm_packus_epi16(scaled_diff_0, scaled_diff_1), diff_offset); + const __m128i mask_value = _mm_min_epi8(adjusted_diff, mask_ceiling); + + if (mask_is_inverse) { + const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value); + if (is_store_16) { + StoreAligned16(mask, inverted_mask_value); + } else { + StoreLo8(mask, inverted_mask_value); + StoreHi8(mask + mask_stride, inverted_mask_value); + } + } else { + if (is_store_16) { + StoreAligned16(mask, mask_value); + } else { + StoreLo8(mask, mask_value); + StoreHi8(mask + mask_stride, mask_value); + } + } +} + +#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, \ + mask_stride) + +#define WEIGHT8_PAIR_AND_STRIDE_10BPP \ + WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; \ + pred_0 += 8 << 1; \ + pred_1 += 8 << 1; \ + mask += mask_stride << 1 + +template <bool mask_is_inverse> +void WeightMask8x8_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask8x16_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 3; + do { + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask8x32_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y5 = 5; + do { + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + } while (--y5 != 0); + WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; +} + +#define WEIGHT16_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride) + +#define WEIGHT16_AND_STRIDE_10BPP \ + WEIGHT16_WITHOUT_STRIDE_10BPP; \ + pred_0 += 16; \ + pred_1 += 16; \ + mask += mask_stride + +template <bool mask_is_inverse> +void WeightMask16x8_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y = 7; + do { + WEIGHT16_AND_STRIDE_10BPP; + } while (--y != 0); + WEIGHT16_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask16x16_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 5; + do { + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT16_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask16x32_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y5 = 6; + do { + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + } while (--y5 != 0); + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask16x64_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 21; + do { + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT16_WITHOUT_STRIDE_10BPP; +} + +#define WEIGHT32_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride); \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride) + +#define WEIGHT32_AND_STRIDE_10BPP \ + WEIGHT32_WITHOUT_STRIDE_10BPP; \ + pred_0 += 32; \ + pred_1 += 32; \ + mask += mask_stride + +template <bool mask_is_inverse> +void WeightMask32x8_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask32x16_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 5; + do { + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT32_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask32x32_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y5 = 6; + do { + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + } while (--y5 != 0); + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask32x64_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 21; + do { + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT32_WITHOUT_STRIDE_10BPP; +} + +#define WEIGHT64_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride); \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride); \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \ + mask + 32, mask_stride); \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \ + mask + 48, mask_stride) + +#define WEIGHT64_AND_STRIDE_10BPP \ + WEIGHT64_WITHOUT_STRIDE_10BPP; \ + pred_0 += 64; \ + pred_1 += 64; \ + mask += mask_stride + +template <bool mask_is_inverse> +void WeightMask64x16_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 5; + do { + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask64x32_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y5 = 6; + do { + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + } while (--y5 != 0); + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask64x64_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 21; + do { + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask64x128_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 42; + do { + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask128x64_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 21; + const ptrdiff_t adjusted_mask_stride = mask_stride - 64; + do { + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + } while (--y3 != 0); + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask128x128_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 42; + const ptrdiff_t adjusted_mask_stride = mask_stride - 64; + do { + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + } while (--y3 != 0); + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +#define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \ + dsp->weight_mask[w_index][h_index][0] = \ + WeightMask##width##x##height##_10bpp_SSE4<0>; \ + dsp->weight_mask[w_index][h_index][1] = \ + WeightMask##width##x##height##_10bpp_SSE4<1> +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + INIT_WEIGHT_MASK_10BPP(8, 8, 0, 0); + INIT_WEIGHT_MASK_10BPP(8, 16, 0, 1); + INIT_WEIGHT_MASK_10BPP(8, 32, 0, 2); + INIT_WEIGHT_MASK_10BPP(16, 8, 1, 0); + INIT_WEIGHT_MASK_10BPP(16, 16, 1, 1); + INIT_WEIGHT_MASK_10BPP(16, 32, 1, 2); + INIT_WEIGHT_MASK_10BPP(16, 64, 1, 3); + INIT_WEIGHT_MASK_10BPP(32, 8, 2, 0); + INIT_WEIGHT_MASK_10BPP(32, 16, 2, 1); + INIT_WEIGHT_MASK_10BPP(32, 32, 2, 2); + INIT_WEIGHT_MASK_10BPP(32, 64, 2, 3); + INIT_WEIGHT_MASK_10BPP(64, 16, 3, 1); + INIT_WEIGHT_MASK_10BPP(64, 32, 3, 2); + INIT_WEIGHT_MASK_10BPP(64, 64, 3, 3); + INIT_WEIGHT_MASK_10BPP(64, 128, 3, 4); + INIT_WEIGHT_MASK_10BPP(128, 64, 4, 3); + INIT_WEIGHT_MASK_10BPP(128, 128, 4, 4); +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void WeightMaskInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { |