Diffstat (limited to 'src/dsp/x86/loop_restoration_sse4.cc')
-rw-r--r--  src/dsp/x86/loop_restoration_sse4.cc | 241
1 file changed, 134 insertions(+), 107 deletions(-)
diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc
index 24f5ad2..273bcc8 100644
--- a/src/dsp/x86/loop_restoration_sse4.cc
+++ b/src/dsp/x86/loop_restoration_sse4.cc
@@ -481,13 +481,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer,
}
}
-void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info,
- const void* const source, const void* const top_border,
- const void* const bottom_border,
- const ptrdiff_t stride, const int width,
- const int height,
- RestorationBuffer* const restoration_buffer,
- void* const dest) {
+void WienerFilter_SSE4_1(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
const int16_t* const number_leading_zero_coefficients =
restoration_info.wiener_info.number_leading_zero_coefficients;
const int number_rows_to_skip = std::max(
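The Wiener and self-guided entry points now take the frame stride right after |source| and carry dedicated strides for the top and bottom border rows, so the borders no longer need to share the frame's stride. A sketch of a matching function-pointer type, assuming the caller-side declaration follows the same parameter order (the alias name is illustrative and not taken from dsp.h):

    // Hypothetical pointer type mirroring the new WienerFilter_SSE4_1 signature.
    // RestorationUnitInfo and RestorationBuffer are the existing libgav1 types;
    // <cstddef> provides ptrdiff_t.
    using RestorationFilterFunc = void (*)(
        const RestorationUnitInfo& restoration_info, const void* source,
        ptrdiff_t stride, const void* top_border, ptrdiff_t top_border_stride,
        const void* bottom_border, ptrdiff_t bottom_border_stride, int width,
        int height, RestorationBuffer* restoration_buffer, void* dest);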
@@ -516,45 +515,48 @@ void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info,
const __m128i coefficients_horizontal =
_mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
- WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
- wiener_stride, height_extra, filter_horizontal[0],
- coefficients_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, height_extra,
filter_horizontal[0], coefficients_horizontal,
&wiener_buffer_horizontal);
- WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
filter_horizontal[0], coefficients_horizontal,
&wiener_buffer_horizontal);
- } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
- WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
- wiener_stride, height_extra, filter_horizontal[1],
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal[0],
coefficients_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, height_extra,
filter_horizontal[1], coefficients_horizontal,
&wiener_buffer_horizontal);
- WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
filter_horizontal[1], coefficients_horizontal,
&wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal[1],
+ coefficients_horizontal, &wiener_buffer_horizontal);
} else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
// The maximum over-reads happen here.
- WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
- wiener_stride, height_extra, filter_horizontal[2],
- coefficients_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
filter_horizontal[2], coefficients_horizontal,
&wiener_buffer_horizontal);
- WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
filter_horizontal[2], coefficients_horizontal,
&wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal[2],
+ coefficients_horizontal, &wiener_buffer_horizontal);
} else {
assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
- WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
- wiener_stride, height_extra,
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
&wiener_buffer_horizontal);
WienerHorizontalTap1(src, stride, wiener_stride, height,
&wiener_buffer_horizontal);
- WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
- &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
}
// vertical filtering.
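Each branch above pairs a tap count with a fixed left offset on the source pointers: every additional leading zero coefficient removes one tap from each end of the symmetric filter and moves the read window one pixel to the right. A small sketch of that relationship, derived from the calls above (helper names are illustrative):

    // nlz = number_leading_zero_coefficients[WienerInfo::kHorizontal] in [0, 3].
    constexpr int HorizontalTaps(int nlz) { return 7 - 2 * nlz; }    // 7, 5, 3, 1
    constexpr int HorizontalLeftOffset(int nlz) { return 3 - nlz; }  // src - 3 ... src
    static_assert(HorizontalTaps(0) == 7 && HorizontalLeftOffset(0) == 3, "");
    static_assert(HorizontalTaps(3) == 1 && HorizontalLeftOffset(3) == 0, "");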
@@ -1160,11 +1162,26 @@ inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
return _mm_packus_epi32(z0, z1);
}
-template <int n>
-inline __m128i CalculateB(const __m128i sum, const __m128i ma) {
- static_assert(n == 9 || n == 25, "");
+inline __m128i CalculateB5(const __m128i sum, const __m128i ma) {
+ // one_over_n == 164.
constexpr uint32_t one_over_n =
- ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
+ // |ma| is in range [0, 255].
+ const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+ const __m128i m0 = VmullLo16(m, sum);
+ const __m128i m1 = VmullHi16(m, sum);
+ const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+ return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline __m128i CalculateB3(const __m128i sum, const __m128i ma) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
const __m128i m0 = VmullLo16(ma, sum);
const __m128i m1 = VmullHi16(ma, sum);
const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
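Splitting CalculateB<n> lets the n == 25 case fold its reciprocal into a byte-sized multiplier: 164 == 41 << 2, so multiplying by 41 via _mm_maddubs_epi16 (while each 16-bit lane of |ma| still holds a single byte) and shifting by two fewer bits afterwards produces the same value. A scalar sketch of the equivalence, assuming kSgrProjReciprocalBits == 12, which matches the 164 and 455 constants quoted in the comments:

    #include <cstdint>

    // (ma * sum * 164 + (1 << 11)) >> 12 == ((ma * 41) * sum + (1 << 9)) >> 10.
    // ma is in [0, 255], so ma * 41 <= 10455 and the _mm_maddubs_epi16 products
    // cannot saturate their signed 16-bit lanes.
    constexpr uint32_t B5Reference(uint32_t sum, uint32_t ma) {
      return (ma * sum * 164 + (1u << 11)) >> 12;
    }
    constexpr uint32_t B5QuarterMultiplier(uint32_t sum, uint32_t ma) {
      return ((ma * 41) * sum + (1u << 9)) >> 10;
    }
    // Worst case for radius 2 (see the range comments further down): sum <= 6375.
    static_assert(B5Reference(6375, 255) == B5QuarterMultiplier(6375, 255), "");
    static_assert(B5QuarterMultiplier(6375, 255) <= 65535, "");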
@@ -1227,12 +1244,12 @@ inline void LookupIntermediate(const __m128i sum, const __m128i index,
} else {
maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
}
- *b = CalculateB<n>(sum, maq);
+ *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq);
}
// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
// to get value 0 as the shuffle result. The most significant bit 1 comes
-// either from the comparision instruction, or from the sign bit of the index.
+// either from the comparison instruction, or from the sign bit of the index.
inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
__m128i mask;
mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
@@ -1250,15 +1267,15 @@ inline __m128i AdjustValue(const __m128i value, const __m128i index,
inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
__m128i* const ma, __m128i* const b0,
__m128i* const b1) {
- // Use table lookup to read elements which indices are less than 48.
+ // Use table lookup to read elements whose indices are less than 48.
const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
const __m128i indices = _mm_packus_epi16(index[0], index[1]);
__m128i idx;
- // Clip idx to 127 to apply signed comparision instructions.
+ // Clip idx to 127 to apply signed comparison instructions.
idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
- // All elements which indices are less than 48 are set to 0.
+ // All elements whose indices are less than 48 are set to 0.
// Get shuffle results for indices in range [0, 15].
*ma = ShuffleIndex(c0, idx);
// Get shuffle results for indices in range [16, 31].
@@ -1273,12 +1290,12 @@ inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
const __m128i res2 = ShuffleIndex(c2, idx);
*ma = _mm_or_si128(*ma, res2);
- // For elements which indices are larger than 47, since they seldom change
+ // For elements whose indices are larger than 47, since they seldom change
// values with the increase of the index, we use comparison and arithmetic
// operations to calculate their values.
- // Add -128 to apply signed comparision instructions.
+ // Add -128 to apply signed comparison instructions.
idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
- // Elements which indices are larger than 47 (with value 0) are set to 5.
+ // Elements whose indices are larger than 47 (with value 0) are set to 5.
*ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
*ma = AdjustValue(*ma, idx, 55); // 55 is the last index whose value is 5.
*ma = AdjustValue(*ma, idx, 72); // 72 is the last index whose value is 4.
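A scalar model of the comparison path for indices above 47 may make the sequence easier to follow; it assumes AdjustValue() subtracts one whenever the offset index exceeds its threshold (the function body is outside this hunk) and lists only the thresholds visible here:

    #include <cstdint>

    // Indices 0..47 come from the kSgrMaLookup shuffles above; larger indices
    // decay slowly, so their values are derived with compares instead.
    uint8_t MaForLargeIndex(uint32_t index) {  // index in [48, 255]
      uint8_t ma = 5;        // indices 48..55 map to 5 (the _mm_max_epu8 step).
      if (index > 55) --ma;  // 56..72 map to 4.
      if (index > 72) --ma;  // beyond 72, further thresholds follow in the source.
      return ma;
    }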
@@ -1298,9 +1315,9 @@ inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
// Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
// Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
- *b0 = CalculateB<9>(sum[0], maq0);
+ *b0 = CalculateB3(sum[0], maq0);
const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
- *b1 = CalculateB<9>(sum[1], maq1);
+ *b1 = CalculateB3(sum[1], maq1);
}
inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
@@ -1776,9 +1793,9 @@ inline void BoxSumFilterPreProcess(
const uint8_t* const src0, const uint8_t* const src1, const int width,
const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
- const ptrdiff_t sum_width, uint16_t* const ma343[4],
- uint16_t* const ma444[2], uint16_t* ma565, uint32_t* const b343[4],
- uint32_t* const b444[2], uint32_t* b565) {
+ const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+ uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+ uint32_t* b565) {
__m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
@@ -1808,9 +1825,8 @@ inline void BoxSumFilterPreProcess(
Sum565W(b5 + 1, b + 2);
StoreAligned64U32(b565, b);
Prepare3_8<0>(ma3[1], ma3x);
- Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
- Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444[0], b343[1],
- b444[0]);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
Prepare3_8<0>(ma5, ma5x);
ma[0] = Sum565Lo(ma5x);
ma[1] = Sum565Hi(ma5x);
@@ -1854,8 +1870,9 @@ inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
return _mm_packs_epi32(dst_lo, dst_hi); // 13 bits
}
-inline __m128i CalculateFilteredOutputPass1(const __m128i src, __m128i ma[2],
- __m128i b[2][2]) {
+inline __m128i CalculateFilteredOutputPass1(const __m128i src,
+ const __m128i ma[2],
+ const __m128i b[2][2]) {
const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
__m128i b_sum[2];
b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
@@ -1863,8 +1880,9 @@ inline __m128i CalculateFilteredOutputPass1(const __m128i src, __m128i ma[2],
return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
}
-inline __m128i CalculateFilteredOutputPass2(const __m128i src, __m128i ma[3],
- __m128i b[3][2]) {
+inline __m128i CalculateFilteredOutputPass2(const __m128i src,
+ const __m128i ma[3],
+ const __m128i b[3][2]) {
const __m128i ma_sum = Sum3_16(ma);
__m128i b_sum[2];
Sum3_32(b, b_sum);
@@ -1916,15 +1934,15 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
int x = 0;
do {
- __m128i ma[2], ma3[3], b[2][2], sr[2], p[2];
+ __m128i ma[2], ma5[3], b[2][2], sr[2], p[2];
s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
x + 16 + kOverreadInBytesPass1 - width);
s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
x + 16 + kOverreadInBytesPass1 - width);
BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
bs);
- Prepare3_8<0>(mas, ma3);
- ma[1] = Sum565Lo(ma3);
+ Prepare3_8<0>(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
StoreAligned16(ma565[1] + x, ma[1]);
Sum565W(bs, b[1]);
StoreAligned32U32(b565[1] + x, b[1]);
@@ -1939,7 +1957,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
- ma[1] = Sum565Hi(ma3);
+ ma[1] = Sum565Hi(ma5);
StoreAligned16(ma565[1] + x + 8, ma[1]);
Sum565W(bs + 1, b[1]);
StoreAligned32U32(b565[1] + x + 8, b[1]);
@@ -2158,9 +2176,9 @@ inline void BoxFilterLastRow(
const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
- uint16_t* const ma343[4], uint16_t* const ma444[3],
- uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
- uint32_t* const b565[2], uint8_t* const dst) {
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint8_t* const dst) {
__m128i s[2], ma3[2], ma5[2], sq[4], b3[3], b5[3], ma[3], b[3][2];
s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
sq[0] = SquareLo8(s[0]);
@@ -2183,13 +2201,13 @@ inline void BoxFilterLastRow(
Sum343W(b3, b[2]);
const __m128i sr = LoadAligned16(src + x);
const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
- ma[0] = LoadAligned16(ma565[0] + x);
- LoadAligned32U32(b565[0] + x, b[0]);
+ ma[0] = LoadAligned16(ma565 + x);
+ LoadAligned32U32(b565 + x, b[0]);
p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
- ma[0] = LoadAligned16(ma343[0] + x);
- ma[1] = LoadAligned16(ma444[0] + x);
- LoadAligned32U32(b343[0] + x, b[0]);
- LoadAligned32U32(b444[0] + x, b[1]);
+ ma[0] = LoadAligned16(ma343 + x);
+ ma[1] = LoadAligned16(ma444 + x);
+ LoadAligned32U32(b343 + x, b[0]);
+ LoadAligned32U32(b444 + x, b[1]);
p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
@@ -2198,13 +2216,13 @@ inline void BoxFilterLastRow(
ma[2] = Sum343Hi(ma3x);
Sum343W(b3 + 1, b[2]);
const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
- ma[0] = LoadAligned16(ma565[0] + x + 8);
- LoadAligned32U32(b565[0] + x + 8, b[0]);
+ ma[0] = LoadAligned16(ma565 + x + 8);
+ LoadAligned32U32(b565 + x + 8, b[0]);
p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
- ma[0] = LoadAligned16(ma343[0] + x + 8);
- ma[1] = LoadAligned16(ma444[0] + x + 8);
- LoadAligned32U32(b343[0] + x + 8, b[0]);
- LoadAligned32U32(b444[0] + x + 8, b[1]);
+ ma[0] = LoadAligned16(ma343 + x + 8);
+ ma[1] = LoadAligned16(ma444 + x + 8);
+ LoadAligned32U32(b343 + x + 8, b[0]);
+ LoadAligned32U32(b444 + x + 8, b[1]);
p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
@@ -2220,8 +2238,9 @@ inline void BoxFilterLastRow(
LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
const RestorationUnitInfo& restoration_info, const uint8_t* src,
- const uint8_t* const top_border, const uint8_t* bottom_border,
- const ptrdiff_t stride, const int width, const int height,
+ const ptrdiff_t stride, const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
SgrBuffer* const sgr_buffer, uint8_t* dst) {
const auto temp_stride = Align<ptrdiff_t>(width, 16);
const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
@@ -2261,14 +2280,14 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
b565[1] = b565[0] + temp_stride;
assert(scales[0] != 0);
assert(scales[1] != 0);
- BoxSum(top_border, stride, width, sum_stride, sum_width, sum3[0], sum5[1],
- square_sum3[0], square_sum5[1]);
+ BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
+ sum5[1], square_sum3[0], square_sum5[1]);
sum5[0] = sum5[1];
square_sum5[0] = square_sum5[1];
const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
- square_sum5, sum_width, ma343, ma444, ma565[0], b343,
- b444, b565[0]);
+ square_sum5, sum_width, ma343, ma444[0], ma565[0],
+ b343, b444[0], b565[0]);
sum5[0] = sgr_buffer->sum5;
square_sum5[0] = sgr_buffer->square_sum5;
@@ -2298,7 +2317,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
const uint8_t* sr[2];
if ((height & 1) == 0) {
sr[0] = bottom_border;
- sr[1] = bottom_border + stride;
+ sr[1] = bottom_border + bottom_border_stride;
} else {
sr[0] = src + 2 * stride;
sr[1] = bottom_border;
@@ -2322,19 +2341,21 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
std::swap(ma565[0], ma565[1]);
std::swap(b565[0], b565[1]);
}
- BoxFilterLastRow(src + 3, bottom_border + stride, width, sum_width, scales,
- w0, w2, sum3, sum5, square_sum3, square_sum5, ma343, ma444,
- ma565, b343, b444, b565, dst);
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+ square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+ b444[0], b565[0], dst);
}
}
inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
- const uint8_t* src,
+ const uint8_t* src, const ptrdiff_t stride,
const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
const uint8_t* bottom_border,
- const ptrdiff_t stride, const int width,
- const int height, SgrBuffer* const sgr_buffer,
- uint8_t* dst) {
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
const auto temp_stride = Align<ptrdiff_t>(width, 16);
const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
const auto sum_stride = temp_stride + 16;
@@ -2354,8 +2375,8 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
b565[0] = sgr_buffer->b565;
b565[1] = b565[0] + temp_stride;
assert(scale != 0);
- BoxSum<5>(top_border, stride, width, sum_stride, sum_width, sum5[1],
- square_sum5[1]);
+ BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum5[1], square_sum5[1]);
sum5[0] = sum5[1];
square_sum5[0] = square_sum5[1];
const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
@@ -2381,7 +2402,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
const uint8_t* sr[2];
if ((height & 1) == 0) {
sr[0] = bottom_border;
- sr[1] = bottom_border + stride;
+ sr[1] = bottom_border + bottom_border_stride;
} else {
sr[0] = src + 2 * stride;
sr[1] = bottom_border;
@@ -2399,18 +2420,20 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
Circulate5PointersBy2<uint16_t>(sum5);
Circulate5PointersBy2<uint32_t>(square_sum5);
}
- BoxFilterPass1LastRow(src, bottom_border + stride, width, sum_width, scale,
- w0, sum5, square_sum5, ma565[0], b565[0], dst);
+ BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+ sum_width, scale, w0, sum5, square_sum5, ma565[0],
+ b565[0], dst);
}
}
inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
- const uint8_t* src,
+ const uint8_t* src, const ptrdiff_t stride,
const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
const uint8_t* bottom_border,
- const ptrdiff_t stride, const int width,
- const int height, SgrBuffer* const sgr_buffer,
- uint8_t* dst) {
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
const auto temp_stride = Align<ptrdiff_t>(width, 16);
const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
@@ -2436,8 +2459,8 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
b444[0] = sgr_buffer->b444;
b444[1] = b444[0] + temp_stride;
assert(scale != 0);
- BoxSum<3>(top_border, stride, width, sum_stride, sum_width, sum3[0],
- square_sum3[0]);
+ BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum3[0], square_sum3[0]);
BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
sum_width, ma343[0], nullptr, b343[0],
nullptr);
@@ -2448,7 +2471,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
s = src + stride;
} else {
s = bottom_border;
- bottom_border += stride;
+ bottom_border += bottom_border_stride;
}
BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
ma343[1], ma444[0], b343[1], b444[0]);
@@ -2475,7 +2498,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
square_sum3, ma343, ma444, b343, b444, dst);
src += stride;
dst += stride;
- bottom_border += stride;
+ bottom_border += bottom_border_stride;
Circulate3PointersBy1<uint16_t>(ma343);
Circulate3PointersBy1<uint32_t>(b343);
std::swap(ma444[0], ma444[1]);
@@ -2483,13 +2506,14 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
} while (--y != 0);
}
-// If |width| is non-multiple of 8, up to 7 more pixels are written to |dest| in
-// the end of each row. It is safe to overwrite the output as it will not be
+// If |width| is not a multiple of 16, up to 15 more pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it will not be
// part of the visible frame.
void SelfGuidedFilter_SSE4_1(
const RestorationUnitInfo& restoration_info, const void* const source,
- const void* const top_border, const void* const bottom_border,
- const ptrdiff_t stride, const int width, const int height,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
RestorationBuffer* const restoration_buffer, void* const dest) {
const int index = restoration_info.sgr_proj_info.index;
const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
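The updated comment above SelfGuidedFilter_SSE4_1 follows from each output row being stored in 16-pixel chunks, so the last store of a row may spill up to 15 pixels past |width|. A minimal illustration of the bound (Align16() stands in for the Align<ptrdiff_t>(width, 16) used earlier in this file):

    // Example: width == 36 stores ceil(36 / 16) * 16 == 48 pixels per row,
    // i.e. 12 pixels beyond |width|, and the spill can never exceed 15.
    constexpr int Align16(int x) { return (x + 15) & ~15; }
    static_assert(Align16(36) == 48 && Align16(36) - 36 <= 15, "");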
@@ -2503,14 +2527,17 @@ void SelfGuidedFilter_SSE4_1(
// |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
// following assertion.
assert(radius_pass_0 != 0);
- BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3,
- stride, width, height, sgr_buffer, dst);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
} else if (radius_pass_0 == 0) {
- BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2,
- stride, width, height, sgr_buffer, dst);
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
} else {
- BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride,
- width, height, sgr_buffer, dst);
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
}
}
@@ -2538,7 +2565,7 @@ void LoopRestorationInit_SSE4_1() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {