diff options
Diffstat (limited to 'src/dsp/x86/loop_restoration_sse4.cc')
-rw-r--r-- | src/dsp/x86/loop_restoration_sse4.cc | 241 |
1 files changed, 134 insertions, 107 deletions
diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc index 24f5ad2..273bcc8 100644 --- a/src/dsp/x86/loop_restoration_sse4.cc +++ b/src/dsp/x86/loop_restoration_sse4.cc @@ -481,13 +481,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer, } } -void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info, - const void* const source, const void* const top_border, - const void* const bottom_border, - const ptrdiff_t stride, const int width, - const int height, - RestorationBuffer* const restoration_buffer, - void* const dest) { +void WienerFilter_SSE4_1( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { const int16_t* const number_leading_zero_coefficients = restoration_info.wiener_info.number_leading_zero_coefficients; const int number_rows_to_skip = std::max( @@ -516,45 +515,48 @@ void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info, const __m128i coefficients_horizontal = _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0)); if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { - WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride, - wiener_stride, height_extra, filter_horizontal[0], - coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap7(src - 3, stride, wiener_stride, height, + WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3, + top_border_stride, wiener_stride, height_extra, filter_horizontal[0], coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra, + WienerHorizontalTap7(src - 3, stride, wiener_stride, height, filter_horizontal[0], coefficients_horizontal, &wiener_buffer_horizontal); - } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { - WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride, - wiener_stride, height_extra, filter_horizontal[1], + WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal[0], coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(src - 2, stride, wiener_stride, height, + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2, + top_border_stride, wiener_stride, height_extra, filter_horizontal[1], coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra, + WienerHorizontalTap5(src - 2, stride, wiener_stride, height, filter_horizontal[1], coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal[1], + coefficients_horizontal, &wiener_buffer_horizontal); } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { // The maximum over-reads happen here. - WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride, - wiener_stride, height_extra, filter_horizontal[2], - coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap3(src - 1, stride, wiener_stride, height, + WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1, + top_border_stride, wiener_stride, height_extra, filter_horizontal[2], coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra, + WienerHorizontalTap3(src - 1, stride, wiener_stride, height, filter_horizontal[2], coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal[2], + coefficients_horizontal, &wiener_buffer_horizontal); } else { assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); - WienerHorizontalTap1(top + (2 - height_extra) * stride, stride, - wiener_stride, height_extra, + WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride, + top_border_stride, wiener_stride, height_extra, &wiener_buffer_horizontal); WienerHorizontalTap1(src, stride, wiener_stride, height, &wiener_buffer_horizontal); - WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra, - &wiener_buffer_horizontal); + WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride, + height_extra, &wiener_buffer_horizontal); } // vertical filtering. @@ -1160,11 +1162,26 @@ inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2], return _mm_packus_epi32(z0, z1); } -template <int n> -inline __m128i CalculateB(const __m128i sum, const __m128i ma) { - static_assert(n == 9 || n == 25, ""); +inline __m128i CalculateB5(const __m128i sum, const __m128i ma) { + // one_over_n == 164. constexpr uint32_t one_over_n = - ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; + ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25; + // one_over_n_quarter == 41. + constexpr uint32_t one_over_n_quarter = one_over_n >> 2; + static_assert(one_over_n == one_over_n_quarter << 2, ""); + // |ma| is in range [0, 255]. + const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter)); + const __m128i m0 = VmullLo16(m, sum); + const __m128i m1 = VmullHi16(m, sum); + const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2); + const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2); + return _mm_packus_epi32(b_lo, b_hi); +} + +inline __m128i CalculateB3(const __m128i sum, const __m128i ma) { + // one_over_n == 455. + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9; const __m128i m0 = VmullLo16(ma, sum); const __m128i m1 = VmullHi16(ma, sum); const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n)); @@ -1227,12 +1244,12 @@ inline void LookupIntermediate(const __m128i sum, const __m128i index, } else { maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128()); } - *b = CalculateB<n>(sum, maq); + *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq); } // Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b // to get value 0 as the shuffle result. The most significiant bit 1 comes -// either from the comparision instruction, or from the sign bit of the index. +// either from the comparison instruction, or from the sign bit of the index. inline __m128i ShuffleIndex(const __m128i table, const __m128i index) { __m128i mask; mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15)); @@ -1250,15 +1267,15 @@ inline __m128i AdjustValue(const __m128i value, const __m128i index, inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], __m128i* const ma, __m128i* const b0, __m128i* const b1) { - // Use table lookup to read elements which indices are less than 48. + // Use table lookup to read elements whose indices are less than 48. const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16); const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16); const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16); const __m128i indices = _mm_packus_epi16(index[0], index[1]); __m128i idx; - // Clip idx to 127 to apply signed comparision instructions. + // Clip idx to 127 to apply signed comparison instructions. idx = _mm_min_epu8(indices, _mm_set1_epi8(127)); - // All elements which indices are less than 48 are set to 0. + // All elements whose indices are less than 48 are set to 0. // Get shuffle results for indices in range [0, 15]. *ma = ShuffleIndex(c0, idx); // Get shuffle results for indices in range [16, 31]. @@ -1273,12 +1290,12 @@ inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], const __m128i res2 = ShuffleIndex(c2, idx); *ma = _mm_or_si128(*ma, res2); - // For elements which indices are larger than 47, since they seldom change + // For elements whose indices are larger than 47, since they seldom change // values with the increase of the index, we use comparison and arithmetic // operations to calculate their values. - // Add -128 to apply signed comparision instructions. + // Add -128 to apply signed comparison instructions. idx = _mm_add_epi8(indices, _mm_set1_epi8(-128)); - // Elements which indices are larger than 47 (with value 0) are set to 5. + // Elements whose indices are larger than 47 (with value 0) are set to 5. *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5)); *ma = AdjustValue(*ma, idx, 55); // 55 is the last index which value is 5. *ma = AdjustValue(*ma, idx, 72); // 72 is the last index which value is 4. @@ -1298,9 +1315,9 @@ inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128()); - *b0 = CalculateB<9>(sum[0], maq0); + *b0 = CalculateB3(sum[0], maq0); const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128()); - *b1 = CalculateB<9>(sum[1], maq1); + *b1 = CalculateB3(sum[1], maq1); } inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], @@ -1776,9 +1793,9 @@ inline void BoxSumFilterPreProcess( const uint8_t* const src0, const uint8_t* const src1, const int width, const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - const ptrdiff_t sum_width, uint16_t* const ma343[4], - uint16_t* const ma444[2], uint16_t* ma565, uint32_t* const b343[4], - uint32_t* const b444[2], uint32_t* b565) { + const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444, + uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444, + uint32_t* b565) { __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3]; s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width); s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width); @@ -1808,9 +1825,8 @@ inline void BoxSumFilterPreProcess( Sum565W(b5 + 1, b + 2); StoreAligned64U32(b565, b); Prepare3_8<0>(ma3[1], ma3x); - Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444[0], b343[1], b444[0]); - Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444[0], b343[1], - b444[0]); + Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444); + Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444); Prepare3_8<0>(ma5, ma5x); ma[0] = Sum565Lo(ma5x); ma[1] = Sum565Hi(ma5x); @@ -1854,8 +1870,9 @@ inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma, return _mm_packs_epi32(dst_lo, dst_hi); // 13 bits } -inline __m128i CalculateFilteredOutputPass1(const __m128i src, __m128i ma[2], - __m128i b[2][2]) { +inline __m128i CalculateFilteredOutputPass1(const __m128i src, + const __m128i ma[2], + const __m128i b[2][2]) { const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]); __m128i b_sum[2]; b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]); @@ -1863,8 +1880,9 @@ inline __m128i CalculateFilteredOutputPass1(const __m128i src, __m128i ma[2], return CalculateFilteredOutput<5>(src, ma_sum, b_sum); } -inline __m128i CalculateFilteredOutputPass2(const __m128i src, __m128i ma[3], - __m128i b[3][2]) { +inline __m128i CalculateFilteredOutputPass2(const __m128i src, + const __m128i ma[3], + const __m128i b[3][2]) { const __m128i ma_sum = Sum3_16(ma); __m128i b_sum[2]; Sum3_32(b, b_sum); @@ -1916,15 +1934,15 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( int x = 0; do { - __m128i ma[2], ma3[3], b[2][2], sr[2], p[2]; + __m128i ma[2], ma5[3], b[2][2], sr[2], p[2]; s[0][1] = LoadUnaligned16Msan(src0 + x + 16, x + 16 + kOverreadInBytesPass1 - width); s[1][1] = LoadUnaligned16Msan(src1 + x + 16, x + 16 + kOverreadInBytesPass1 - width); BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas, bs); - Prepare3_8<0>(mas, ma3); - ma[1] = Sum565Lo(ma3); + Prepare3_8<0>(mas, ma5); + ma[1] = Sum565Lo(ma5); StoreAligned16(ma565[1] + x, ma[1]); Sum565W(bs, b[1]); StoreAligned32U32(b565[1] + x, b[1]); @@ -1939,7 +1957,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0); const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0); - ma[1] = Sum565Hi(ma3); + ma[1] = Sum565Hi(ma5); StoreAligned16(ma565[1] + x + 8, ma[1]); Sum565W(bs + 1, b[1]); StoreAligned32U32(b565[1] + x + 8, b[1]); @@ -2158,9 +2176,9 @@ inline void BoxFilterLastRow( const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0, const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5], uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - uint16_t* const ma343[4], uint16_t* const ma444[3], - uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3], - uint32_t* const b565[2], uint8_t* const dst) { + uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565, + uint32_t* const b343, uint32_t* const b444, uint32_t* const b565, + uint8_t* const dst) { __m128i s[2], ma3[2], ma5[2], sq[4], b3[3], b5[3], ma[3], b[3][2]; s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width); sq[0] = SquareLo8(s[0]); @@ -2183,13 +2201,13 @@ inline void BoxFilterLastRow( Sum343W(b3, b[2]); const __m128i sr = LoadAligned16(src + x); const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128()); - ma[0] = LoadAligned16(ma565[0] + x); - LoadAligned32U32(b565[0] + x, b[0]); + ma[0] = LoadAligned16(ma565 + x); + LoadAligned32U32(b565 + x, b[0]); p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b); - ma[0] = LoadAligned16(ma343[0] + x); - ma[1] = LoadAligned16(ma444[0] + x); - LoadAligned32U32(b343[0] + x, b[0]); - LoadAligned32U32(b444[0] + x, b[1]); + ma[0] = LoadAligned16(ma343 + x); + ma[1] = LoadAligned16(ma444 + x); + LoadAligned32U32(b343 + x, b[0]); + LoadAligned32U32(b444 + x, b[1]); p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b); const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2); @@ -2198,13 +2216,13 @@ inline void BoxFilterLastRow( ma[2] = Sum343Hi(ma3x); Sum343W(b3 + 1, b[2]); const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128()); - ma[0] = LoadAligned16(ma565[0] + x + 8); - LoadAligned32U32(b565[0] + x + 8, b[0]); + ma[0] = LoadAligned16(ma565 + x + 8); + LoadAligned32U32(b565 + x + 8, b[0]); p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b); - ma[0] = LoadAligned16(ma343[0] + x + 8); - ma[1] = LoadAligned16(ma444[0] + x + 8); - LoadAligned32U32(b343[0] + x + 8, b[0]); - LoadAligned32U32(b444[0] + x + 8, b[1]); + ma[0] = LoadAligned16(ma343 + x + 8); + ma[1] = LoadAligned16(ma444 + x + 8); + LoadAligned32U32(b343 + x + 8, b[0]); + LoadAligned32U32(b444 + x + 8, b[1]); p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b); const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2); StoreAligned16(dst + x, _mm_packus_epi16(d0, d1)); @@ -2220,8 +2238,9 @@ inline void BoxFilterLastRow( LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( const RestorationUnitInfo& restoration_info, const uint8_t* src, - const uint8_t* const top_border, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, const int height, + const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, SgrBuffer* const sgr_buffer, uint8_t* dst) { const auto temp_stride = Align<ptrdiff_t>(width, 16); const auto sum_width = Align<ptrdiff_t>(width + 8, 16); @@ -2261,14 +2280,14 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( b565[1] = b565[0] + temp_stride; assert(scales[0] != 0); assert(scales[1] != 0); - BoxSum(top_border, stride, width, sum_stride, sum_width, sum3[0], sum5[1], - square_sum3[0], square_sum5[1]); + BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0], + sum5[1], square_sum3[0], square_sum5[1]); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; const uint8_t* const s = (height > 1) ? src + stride : bottom_border; BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3, - square_sum5, sum_width, ma343, ma444, ma565[0], b343, - b444, b565[0]); + square_sum5, sum_width, ma343, ma444[0], ma565[0], + b343, b444[0], b565[0]); sum5[0] = sgr_buffer->sum5; square_sum5[0] = sgr_buffer->square_sum5; @@ -2298,7 +2317,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( const uint8_t* sr[2]; if ((height & 1) == 0) { sr[0] = bottom_border; - sr[1] = bottom_border + stride; + sr[1] = bottom_border + bottom_border_stride; } else { sr[0] = src + 2 * stride; sr[1] = bottom_border; @@ -2322,19 +2341,21 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( std::swap(ma565[0], ma565[1]); std::swap(b565[0], b565[1]); } - BoxFilterLastRow(src + 3, bottom_border + stride, width, sum_width, scales, - w0, w2, sum3, sum5, square_sum3, square_sum5, ma343, ma444, - ma565, b343, b444, b565, dst); + BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width, + sum_width, scales, w0, w2, sum3, sum5, square_sum3, + square_sum5, ma343[0], ma444[0], ma565[0], b343[0], + b444[0], b565[0], dst); } } inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, - const uint8_t* src, + const uint8_t* src, const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - uint8_t* dst) { + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { const auto temp_stride = Align<ptrdiff_t>(width, 16); const auto sum_width = Align<ptrdiff_t>(width + 8, 16); const auto sum_stride = temp_stride + 16; @@ -2354,8 +2375,8 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, b565[0] = sgr_buffer->b565; b565[1] = b565[0] + temp_stride; assert(scale != 0); - BoxSum<5>(top_border, stride, width, sum_stride, sum_width, sum5[1], - square_sum5[1]); + BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width, + sum5[1], square_sum5[1]); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; const uint8_t* const s = (height > 1) ? src + stride : bottom_border; @@ -2381,7 +2402,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, const uint8_t* sr[2]; if ((height & 1) == 0) { sr[0] = bottom_border; - sr[1] = bottom_border + stride; + sr[1] = bottom_border + bottom_border_stride; } else { sr[0] = src + 2 * stride; sr[1] = bottom_border; @@ -2399,18 +2420,20 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, Circulate5PointersBy2<uint16_t>(sum5); Circulate5PointersBy2<uint32_t>(square_sum5); } - BoxFilterPass1LastRow(src, bottom_border + stride, width, sum_width, scale, - w0, sum5, square_sum5, ma565[0], b565[0], dst); + BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width, + sum_width, scale, w0, sum5, square_sum5, ma565[0], + b565[0], dst); } } inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, - const uint8_t* src, + const uint8_t* src, const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - uint8_t* dst) { + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { assert(restoration_info.sgr_proj_info.multiplier[0] == 0); const auto temp_stride = Align<ptrdiff_t>(width, 16); const auto sum_width = Align<ptrdiff_t>(width + 8, 16); @@ -2436,8 +2459,8 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, b444[0] = sgr_buffer->b444; b444[1] = b444[0] + temp_stride; assert(scale != 0); - BoxSum<3>(top_border, stride, width, sum_stride, sum_width, sum3[0], - square_sum3[0]); + BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width, + sum3[0], square_sum3[0]); BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, sum_width, ma343[0], nullptr, b343[0], nullptr); @@ -2448,7 +2471,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, s = src + stride; } else { s = bottom_border; - bottom_border += stride; + bottom_border += bottom_border_stride; } BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width, ma343[1], ma444[0], b343[1], b444[0]); @@ -2475,7 +2498,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, square_sum3, ma343, ma444, b343, b444, dst); src += stride; dst += stride; - bottom_border += stride; + bottom_border += bottom_border_stride; Circulate3PointersBy1<uint16_t>(ma343); Circulate3PointersBy1<uint32_t>(b343); std::swap(ma444[0], ma444[1]); @@ -2483,13 +2506,14 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, } while (--y != 0); } -// If |width| is non-multiple of 8, up to 7 more pixels are written to |dest| in -// the end of each row. It is safe to overwrite the output as it will not be +// If |width| is non-multiple of 16, up to 15 more pixels are written to |dest| +// in the end of each row. It is safe to overwrite the output as it will not be // part of the visible frame. void SelfGuidedFilter_SSE4_1( const RestorationUnitInfo& restoration_info, const void* const source, - const void* const top_border, const void* const bottom_border, - const ptrdiff_t stride, const int width, const int height, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, RestorationBuffer* const restoration_buffer, void* const dest) { const int index = restoration_info.sgr_proj_info.index; const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 @@ -2503,14 +2527,17 @@ void SelfGuidedFilter_SSE4_1( // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the // following assertion. assert(radius_pass_0 != 0); - BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3, - stride, width, height, sgr_buffer, dst); + BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, + width, height, sgr_buffer, dst); } else if (radius_pass_0 == 0) { - BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2, - stride, width, height, sgr_buffer, dst); + BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2, + top_border_stride, bottom - 2, bottom_border_stride, + width, height, sgr_buffer, dst); } else { - BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride, - width, height, sgr_buffer, dst); + BoxFilterProcess(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, width, + height, sgr_buffer, dst); } } @@ -2538,7 +2565,7 @@ void LoopRestorationInit_SSE4_1() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { |