aboutsummaryrefslogtreecommitdiff
path: root/src/dsp/arm/loop_restoration_neon.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/arm/loop_restoration_neon.cc')
-rw-r--r--src/dsp/arm/loop_restoration_neon.cc687
1 files changed, 329 insertions, 358 deletions
diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc
index e6ceb66..2db137f 100644
--- a/src/dsp/arm/loop_restoration_neon.cc
+++ b/src/dsp/arm/loop_restoration_neon.cc
@@ -28,6 +28,7 @@
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
#include "src/utils/constants.h"
namespace libgav1 {
@@ -491,11 +492,14 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer,
// filter row by row. This is faster than doing it column by column when
// considering cache issues.
void WienerFilter_NEON(
- const RestorationUnitInfo& restoration_info, const void* const source,
- const ptrdiff_t stride, const void* const top_border,
- const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
const ptrdiff_t bottom_border_stride, const int width, const int height,
- RestorationBuffer* const restoration_buffer, void* const dest) {
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
const int16_t* const number_leading_zero_coefficients =
restoration_info.wiener_info.number_leading_zero_coefficients;
const int number_rows_to_skip = std::max(
@@ -591,6 +595,74 @@ void WienerFilter_NEON(
//------------------------------------------------------------------------------
// SGR
+// SIMD overreads 8 - (width % 8) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kOverreadInBytesPass1 = 2;
+constexpr int kOverreadInBytesPass2 = 4;
+
+// SIMD overreads 16 - (width % 16) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kWideOverreadInBytesPass1 = 10;
+constexpr int kWideOverreadInBytesPass2 = 12;
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ uint16x8_t dst[2]) {
+ dst[0] = vld1q_u16(src[0] + x);
+ dst[1] = vld1q_u16(src[1] + x);
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ uint16x8_t dst[3]) {
+ dst[0] = vld1q_u16(src[0] + x);
+ dst[1] = vld1q_u16(src[1] + x);
+ dst[2] = vld1q_u16(src[2] + x);
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, uint32x4x2_t* dst) {
+ (*dst).val[0] = vld1q_u32(src + 0);
+ (*dst).val[1] = vld1q_u32(src + 4);
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ uint32x4x2_t dst[2]) {
+ LoadAligned32U32(src[0] + x, &dst[0]);
+ LoadAligned32U32(src[1] + x, &dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ uint32x4x2_t dst[3]) {
+ LoadAligned32U32(src[0] + x, &dst[0]);
+ LoadAligned32U32(src[1] + x, &dst[1]);
+ LoadAligned32U32(src[2] + x, &dst[2]);
+}
+
+inline void StoreAligned32U16(uint16_t* const dst, const uint16x8_t src[2]) {
+ vst1q_u16(dst + 0, src[0]);
+ vst1q_u16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const uint32x4x2_t src) {
+ vst1q_u32(dst + 0, src.val[0]);
+ vst1q_u32(dst + 4, src.val[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const uint32x4x2_t src[2]) {
+ vst1q_u32(dst + 0, src[0].val[0]);
+ vst1q_u32(dst + 4, src[0].val[1]);
+ vst1q_u32(dst + 8, src[1].val[0]);
+ vst1q_u32(dst + 12, src[1].val[1]);
+}
+
+inline uint16x8_t SquareLo8(const uint8x8_t src) { return vmull_u8(src, src); }
+
+inline uint16x8_t SquareLo8(const uint8x16_t src) {
+ return vmull_u8(vget_low_u8(src), vget_low_u8(src));
+}
+
+inline uint16x8_t SquareHi8(const uint8x16_t src) {
+ return vmull_u8(vget_high_u8(src), vget_high_u8(src));
+}
+
inline void Prepare3_8(const uint8x8_t src[2], uint8x8_t dst[3]) {
dst[0] = VshrU128<0>(src);
dst[1] = VshrU128<1>(src);
@@ -904,58 +976,69 @@ inline uint32x4x2_t Sum565W(const uint16x8_t src[2]) {
}
inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
- const ptrdiff_t sum_stride, uint16_t* sum3, uint16_t* sum5,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
uint32_t* square_sum3, uint32_t* square_sum5) {
+ const ptrdiff_t overread_in_bytes = kOverreadInBytesPass1 - width;
int y = 2;
// Don't change loop width to 16, which is even slower.
do {
uint8x8_t s[2];
uint16x8_t sq[2];
- s[0] = vld1_u8(src);
- sq[0] = vmull_u8(s[0], s[0]);
- ptrdiff_t x = 0;
+ s[0] = Load1MsanU8(src, overread_in_bytes);
+ sq[0] = SquareLo8(s[0]);
+ ptrdiff_t x = sum_width;
do {
uint16x8_t row3, row5;
uint32x4x2_t row_sq3, row_sq5;
- s[1] = vld1_u8(src + x + 8);
- sq[1] = vmull_u8(s[1], s[1]);
+ x -= 8;
+ src += 8;
+ s[1] = Load1MsanU8(src, sum_width - x + overread_in_bytes);
+ sq[1] = SquareLo8(s[1]);
SumHorizontal(s, sq, &row3, &row5, &row_sq3, &row_sq5);
vst1q_u16(sum3, row3);
vst1q_u16(sum5, row5);
- vst1q_u32(square_sum3 + 0, row_sq3.val[0]);
- vst1q_u32(square_sum3 + 4, row_sq3.val[1]);
- vst1q_u32(square_sum5 + 0, row_sq5.val[0]);
- vst1q_u32(square_sum5 + 4, row_sq5.val[1]);
+ StoreAligned32U32(square_sum3 + 0, row_sq3);
+ StoreAligned32U32(square_sum5 + 0, row_sq5);
s[0] = s[1];
sq[0] = sq[1];
sum3 += 8;
sum5 += 8;
square_sum3 += 8;
square_sum5 += 8;
- x += 8;
- } while (x < sum_stride);
- src += src_stride;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sum3 += sum_stride - sum_width;
+ sum5 += sum_stride - sum_width;
+ square_sum3 += sum_stride - sum_width;
+ square_sum5 += sum_stride - sum_width;
} while (--y != 0);
}
template <int size>
inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
- const ptrdiff_t sum_stride, uint16_t* sums,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
uint32_t* square_sums) {
static_assert(size == 3 || size == 5, "");
+ const ptrdiff_t overread_in_bytes =
+ ((size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2) -
+ sizeof(*src) * width;
int y = 2;
// Don't change loop width to 16, which is even slower.
do {
uint8x8_t s[2];
uint16x8_t sq[2];
- s[0] = vld1_u8(src);
- sq[0] = vmull_u8(s[0], s[0]);
- ptrdiff_t x = 0;
+ s[0] = Load1MsanU8(src, overread_in_bytes);
+ sq[0] = SquareLo8(s[0]);
+ ptrdiff_t x = sum_width;
do {
uint16x8_t row;
uint32x4x2_t row_sq;
- s[1] = vld1_u8(src + x + 8);
- sq[1] = vmull_u8(s[1], s[1]);
+ x -= 8;
+ src += 8;
+ s[1] = Load1MsanU8(src, sum_width - x + overread_in_bytes);
+ sq[1] = SquareLo8(s[1]);
if (size == 3) {
row = Sum3Horizontal(s);
row_sq = Sum3WHorizontal(sq);
@@ -964,15 +1047,15 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
row_sq = Sum5WHorizontal(sq);
}
vst1q_u16(sums, row);
- vst1q_u32(square_sums + 0, row_sq.val[0]);
- vst1q_u32(square_sums + 4, row_sq.val[1]);
+ StoreAligned32U32(square_sums, row_sq);
s[0] = s[1];
sq[0] = sq[1];
sums += 8;
square_sums += 8;
- x += 8;
- } while (x < sum_stride);
- src += src_stride;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sums += sum_stride - sum_width;
+ square_sums += sum_stride - sum_width;
} while (--y != 0);
}
@@ -1143,339 +1226,216 @@ inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
- const uint8_t* const src0, const uint8_t* const src1, const uint32_t scale,
- uint8x16_t s[2][2], uint16_t* const sum5[5], uint32_t* const square_sum5[5],
- uint16x8_t sq[2][4], uint8x16_t* const ma, uint16x8_t* const b) {
+ uint8x16_t s[2][2], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t* const ma,
+ uint16x8_t* const b) {
uint16x8_t s5[5];
uint32x4x2_t sq5[5];
- s[0][0] = vld1q_u8(src0);
- s[1][0] = vld1q_u8(src1);
- sq[0][0] = vmull_u8(vget_low_u8(s[0][0]), vget_low_u8(s[0][0]));
- sq[1][0] = vmull_u8(vget_low_u8(s[1][0]), vget_low_u8(s[1][0]));
- sq[0][1] = vmull_u8(vget_high_u8(s[0][0]), vget_high_u8(s[0][0]));
- sq[1][1] = vmull_u8(vget_high_u8(s[1][0]), vget_high_u8(s[1][0]));
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
s5[3] = Sum5Horizontal(s[0][0]);
s5[4] = Sum5Horizontal(s[1][0]);
sq5[3] = Sum5WHorizontal(sq[0]);
sq5[4] = Sum5WHorizontal(sq[1]);
vst1q_u16(sum5[3], s5[3]);
vst1q_u16(sum5[4], s5[4]);
- vst1q_u32(square_sum5[3] + 0, sq5[3].val[0]);
- vst1q_u32(square_sum5[3] + 4, sq5[3].val[1]);
- vst1q_u32(square_sum5[4] + 0, sq5[4].val[0]);
- vst1q_u32(square_sum5[4] + 4, sq5[4].val[1]);
- s5[0] = vld1q_u16(sum5[0]);
- s5[1] = vld1q_u16(sum5[1]);
- s5[2] = vld1q_u16(sum5[2]);
- sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
- sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
- sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
- sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
- sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
- sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
- const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
- const uint32_t scale, uint8x16_t s[2][2], uint16_t* const sum5[5],
- uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma[2],
- uint16x8_t b[2]) {
+ uint8x16_t s[2][2], const ptrdiff_t x, const uint32_t scale,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5],
+ uint16x8_t sq[2][4], uint8x16_t ma[2], uint16x8_t b[2]) {
uint16x8_t s5[2][5];
uint32x4x2_t sq5[5];
- s[0][1] = vld1q_u8(src0 + x + 8);
- s[1][1] = vld1q_u8(src1 + x + 8);
- sq[0][2] = vmull_u8(vget_low_u8(s[0][1]), vget_low_u8(s[0][1]));
- sq[1][2] = vmull_u8(vget_low_u8(s[1][1]), vget_low_u8(s[1][1]));
+ sq[0][2] = SquareLo8(s[0][1]);
+ sq[1][2] = SquareLo8(s[1][1]);
Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
sq5[3] = Sum5WHorizontal(sq[0] + 1);
sq5[4] = Sum5WHorizontal(sq[1] + 1);
vst1q_u16(sum5[3] + x, s5[0][3]);
vst1q_u16(sum5[4] + x, s5[0][4]);
- vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
- vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
- vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
- vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
- s5[0][0] = vld1q_u16(sum5[0] + x);
- s5[0][1] = vld1q_u16(sum5[1] + x);
- s5[0][2] = vld1q_u16(sum5[2] + x);
- sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
- sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
- sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
- sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
- sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
- sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]);
- sq[0][3] = vmull_u8(vget_high_u8(s[0][1]), vget_high_u8(s[0][1]));
- sq[1][3] = vmull_u8(vget_high_u8(s[1][1]), vget_high_u8(s[1][1]));
+ sq[0][3] = SquareHi8(s[0][1]);
+ sq[1][3] = SquareHi8(s[1][1]);
sq5[3] = Sum5WHorizontal(sq[0] + 2);
sq5[4] = Sum5WHorizontal(sq[1] + 2);
vst1q_u16(sum5[3] + x + 8, s5[1][3]);
vst1q_u16(sum5[4] + x + 8, s5[1][4]);
- vst1q_u32(square_sum5[3] + x + 8, sq5[3].val[0]);
- vst1q_u32(square_sum5[3] + x + 12, sq5[3].val[1]);
- vst1q_u32(square_sum5[4] + x + 8, sq5[4].val[0]);
- vst1q_u32(square_sum5[4] + x + 12, sq5[4].val[1]);
- s5[1][0] = vld1q_u16(sum5[0] + x + 8);
- s5[1][1] = vld1q_u16(sum5[1] + x + 8);
- s5[1][2] = vld1q_u16(sum5[2] + x + 8);
- sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
- sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
- sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
- sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
- sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
- sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x3U16(sum5, x + 8, s5[1]);
+ LoadAligned32x3U32(square_sum5, x + 8, sq5);
CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
- const uint8_t* const src, const uint32_t scale, uint8x16_t* const s,
- const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
- uint16x8_t sq[2], uint8x16_t* const ma, uint16x8_t* const b) {
+ uint8x16_t* const s, const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], uint16x8_t sq[2],
+ uint8x16_t* const ma, uint16x8_t* const b) {
uint16x8_t s5[5];
uint32x4x2_t sq5[5];
- *s = vld1q_u8(src);
- sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s));
- sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s));
+ sq[0] = SquareLo8(s[0]);
+ sq[1] = SquareHi8(s[0]);
s5[3] = s5[4] = Sum5Horizontal(*s);
sq5[3] = sq5[4] = Sum5WHorizontal(sq);
- s5[0] = vld1q_u16(sum5[0]);
- s5[1] = vld1q_u16(sum5[1]);
- s5[2] = vld1q_u16(sum5[2]);
- sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
- sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
- sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
- sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
- sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
- sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
- const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
- uint8x16_t s[2], const uint16_t* const sum5[5],
- const uint32_t* const square_sum5[5], uint16x8_t sq[3], uint8x16_t ma[2],
- uint16x8_t b[2]) {
+ uint8x16_t s[2], const ptrdiff_t x, const uint32_t scale,
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ uint16x8_t sq[3], uint8x16_t ma[2], uint16x8_t b[2]) {
uint16x8_t s5[2][5];
uint32x4x2_t sq5[5];
- s[1] = vld1q_u8(src + x + 8);
- sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+ sq[1] = SquareLo8(s[1]);
Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]);
sq5[3] = sq5[4] = Sum5WHorizontal(sq);
- s5[0][0] = vld1q_u16(sum5[0] + x);
- s5[0][1] = vld1q_u16(sum5[1] + x);
- s5[0][2] = vld1q_u16(sum5[2] + x);
+ LoadAligned16x3U16(sum5, x, s5[0]);
s5[0][4] = s5[0][3];
- sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
- sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
- sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
- sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
- sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
- sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+ LoadAligned32x3U32(square_sum5, x, sq5);
CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]);
- sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+ sq[2] = SquareHi8(s[1]);
sq5[3] = sq5[4] = Sum5WHorizontal(sq + 1);
- s5[1][0] = vld1q_u16(sum5[0] + x + 8);
- s5[1][1] = vld1q_u16(sum5[1] + x + 8);
- s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+ LoadAligned16x3U16(sum5, x + 8, s5[1]);
s5[1][4] = s5[1][3];
- sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
- sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
- sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
- sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
- sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
- sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ LoadAligned32x3U32(square_sum5, x + 8, sq5);
CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
- const uint8_t* const src, const uint32_t scale, uint8x16_t* const s,
- uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16x8_t sq[2],
- uint8x16_t* const ma, uint16x8_t* const b) {
+ uint8x16_t* const s, const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], uint16x8_t sq[2], uint8x16_t* const ma,
+ uint16x8_t* const b) {
uint16x8_t s3[3];
uint32x4x2_t sq3[3];
- *s = vld1q_u8(src);
- sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s));
- sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s));
+ sq[0] = SquareLo8(*s);
+ sq[1] = SquareHi8(*s);
s3[2] = Sum3Horizontal(*s);
sq3[2] = Sum3WHorizontal(sq);
vst1q_u16(sum3[2], s3[2]);
- vst1q_u32(square_sum3[2] + 0, sq3[2].val[0]);
- vst1q_u32(square_sum3[2] + 4, sq3[2].val[1]);
- s3[0] = vld1q_u16(sum3[0]);
- s3[1] = vld1q_u16(sum3[1]);
- sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0);
- sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4);
- sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0);
- sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
CalculateIntermediate3<0>(s3, sq3, scale, ma, b);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
- const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
- uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint8x16_t s[2],
- uint16x8_t sq[3], uint8x16_t ma[2], uint16x8_t b[2]) {
+ uint8x16_t s[2], const ptrdiff_t x, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16x8_t sq[3],
+ uint8x16_t ma[2], uint16x8_t b[2]) {
uint16x8_t s3[4];
uint32x4x2_t sq3[3];
- s[1] = vld1q_u8(src + x + 8);
- sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+ sq[1] = SquareLo8(s[1]);
Sum3Horizontal<8>(s, s3 + 2);
sq3[2] = Sum3WHorizontal(sq);
vst1q_u16(sum3[2] + x, s3[2]);
- vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
- vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
- s3[0] = vld1q_u16(sum3[0] + x);
- s3[1] = vld1q_u16(sum3[1] + x);
- sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
- sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
- sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
- sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
+ StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+ LoadAligned16x2U16(sum3, x, s3);
+ LoadAligned32x2U32(square_sum3, x, sq3);
CalculateIntermediate3<8>(s3, sq3, scale, &ma[0], &b[0]);
- sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+ sq[2] = SquareHi8(s[1]);
sq3[2] = Sum3WHorizontal(sq + 1);
vst1q_u16(sum3[2] + x + 8, s3[3]);
- vst1q_u32(square_sum3[2] + x + 8, sq3[2].val[0]);
- vst1q_u32(square_sum3[2] + x + 12, sq3[2].val[1]);
- s3[1] = vld1q_u16(sum3[0] + x + 8);
- s3[2] = vld1q_u16(sum3[1] + x + 8);
- sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8);
- sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12);
- sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8);
- sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ LoadAligned16x2U16(sum3, x + 8, s3 + 1);
+ LoadAligned32x2U32(square_sum3, x + 8, sq3);
CalculateIntermediate3<0>(s3 + 1, sq3, scale, &ma[1], &b[1]);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
- const uint8_t* const src0, const uint8_t* const src1,
- const uint16_t scales[2], uint8x16_t s[2][2], uint16_t* const sum3[4],
+ uint8x16_t s[2][2], const uint16_t scales[2], uint16_t* const sum3[4],
uint16_t* const sum5[5], uint32_t* const square_sum3[4],
uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2],
uint16x8_t b3[2][3], uint8x16_t* const ma5, uint16x8_t* const b5) {
uint16x8_t s3[4], s5[5];
uint32x4x2_t sq3[4], sq5[5];
- s[0][0] = vld1q_u8(src0);
- s[1][0] = vld1q_u8(src1);
- sq[0][0] = vmull_u8(vget_low_u8(s[0][0]), vget_low_u8(s[0][0]));
- sq[1][0] = vmull_u8(vget_low_u8(s[1][0]), vget_low_u8(s[1][0]));
- sq[0][1] = vmull_u8(vget_high_u8(s[0][0]), vget_high_u8(s[0][0]));
- sq[1][1] = vmull_u8(vget_high_u8(s[1][0]), vget_high_u8(s[1][0]));
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
SumHorizontal(s[0][0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]);
SumHorizontal(s[1][0], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]);
vst1q_u16(sum3[2], s3[2]);
vst1q_u16(sum3[3], s3[3]);
- vst1q_u32(square_sum3[2] + 0, sq3[2].val[0]);
- vst1q_u32(square_sum3[2] + 4, sq3[2].val[1]);
- vst1q_u32(square_sum3[3] + 0, sq3[3].val[0]);
- vst1q_u32(square_sum3[3] + 4, sq3[3].val[1]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
vst1q_u16(sum5[3], s5[3]);
vst1q_u16(sum5[4], s5[4]);
- vst1q_u32(square_sum5[3] + 0, sq5[3].val[0]);
- vst1q_u32(square_sum5[3] + 4, sq5[3].val[1]);
- vst1q_u32(square_sum5[4] + 0, sq5[4].val[0]);
- vst1q_u32(square_sum5[4] + 4, sq5[4].val[1]);
- s3[0] = vld1q_u16(sum3[0]);
- s3[1] = vld1q_u16(sum3[1]);
- sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0);
- sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4);
- sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0);
- sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4);
- s5[0] = vld1q_u16(sum5[0]);
- s5[1] = vld1q_u16(sum5[1]);
- s5[2] = vld1q_u16(sum5[2]);
- sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
- sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
- sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
- sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
- sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
- sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
CalculateIntermediate3<0>(s3, sq3, scales[1], ma3[0], b3[0]);
CalculateIntermediate3<0>(s3 + 1, sq3 + 1, scales[1], ma3[1], b3[1]);
CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
- const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
- const uint16_t scales[2], uint8x16_t s[2][2], uint16_t* const sum3[4],
- uint16_t* const sum5[5], uint32_t* const square_sum3[4],
- uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2],
- uint16x8_t b3[2][3], uint8x16_t ma5[2], uint16x8_t b5[2]) {
+ const uint8x16_t s[2][2], const ptrdiff_t x, const uint16_t scales[2],
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16x8_t sq[2][4], uint8x16_t ma3[2][2], uint16x8_t b3[2][3],
+ uint8x16_t ma5[2], uint16x8_t b5[2]) {
uint16x8_t s3[2][4], s5[2][5];
uint32x4x2_t sq3[4], sq5[5];
- s[0][1] = vld1q_u8(src0 + x + 8);
- s[1][1] = vld1q_u8(src1 + x + 8);
- sq[0][2] = vmull_u8(vget_low_u8(s[0][1]), vget_low_u8(s[0][1]));
- sq[1][2] = vmull_u8(vget_low_u8(s[1][1]), vget_low_u8(s[1][1]));
+ sq[0][2] = SquareLo8(s[0][1]);
+ sq[1][2] = SquareLo8(s[1][1]);
SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
SumHorizontal(sq[0] + 1, &sq3[2], &sq5[3]);
SumHorizontal(sq[1] + 1, &sq3[3], &sq5[4]);
vst1q_u16(sum3[2] + x, s3[0][2]);
vst1q_u16(sum3[3] + x, s3[0][3]);
- vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
- vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
- vst1q_u32(square_sum3[3] + x + 0, sq3[3].val[0]);
- vst1q_u32(square_sum3[3] + x + 4, sq3[3].val[1]);
+ StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+ StoreAligned32U32(square_sum3[3] + x, sq3[3]);
vst1q_u16(sum5[3] + x, s5[0][3]);
vst1q_u16(sum5[4] + x, s5[0][4]);
- vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
- vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
- vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
- vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
- s3[0][0] = vld1q_u16(sum3[0] + x);
- s3[0][1] = vld1q_u16(sum3[1] + x);
- sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
- sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
- sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
- sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
- s5[0][0] = vld1q_u16(sum5[0] + x);
- s5[0][1] = vld1q_u16(sum5[1] + x);
- s5[0][2] = vld1q_u16(sum5[2] + x);
- sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
- sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
- sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
- sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
- sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
- sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0][0], &b3[0][1]);
CalculateIntermediate3<8>(s3[0] + 1, sq3 + 1, scales[1], &ma3[1][0],
&b3[1][1]);
CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]);
- sq[0][3] = vmull_u8(vget_high_u8(s[0][1]), vget_high_u8(s[0][1]));
- sq[1][3] = vmull_u8(vget_high_u8(s[1][1]), vget_high_u8(s[1][1]));
+ sq[0][3] = SquareHi8(s[0][1]);
+ sq[1][3] = SquareHi8(s[1][1]);
SumHorizontal(sq[0] + 2, &sq3[2], &sq5[3]);
SumHorizontal(sq[1] + 2, &sq3[3], &sq5[4]);
vst1q_u16(sum3[2] + x + 8, s3[1][2]);
vst1q_u16(sum3[3] + x + 8, s3[1][3]);
- vst1q_u32(square_sum3[2] + x + 8, sq3[2].val[0]);
- vst1q_u32(square_sum3[2] + x + 12, sq3[2].val[1]);
- vst1q_u32(square_sum3[3] + x + 8, sq3[3].val[0]);
- vst1q_u32(square_sum3[3] + x + 12, sq3[3].val[1]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
vst1q_u16(sum5[3] + x + 8, s5[1][3]);
vst1q_u16(sum5[4] + x + 8, s5[1][4]);
- vst1q_u32(square_sum5[3] + x + 8, sq5[3].val[0]);
- vst1q_u32(square_sum5[3] + x + 12, sq5[3].val[1]);
- vst1q_u32(square_sum5[4] + x + 8, sq5[4].val[0]);
- vst1q_u32(square_sum5[4] + x + 12, sq5[4].val[1]);
- s3[1][0] = vld1q_u16(sum3[0] + x + 8);
- s3[1][1] = vld1q_u16(sum3[1] + x + 8);
- sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8);
- sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12);
- sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8);
- sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12);
- s5[1][0] = vld1q_u16(sum5[0] + x + 8);
- s5[1][1] = vld1q_u16(sum5[1] + x + 8);
- s5[1][2] = vld1q_u16(sum5[2] + x + 8);
- sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
- sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
- sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
- sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
- sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
- sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x2U16(sum3, x + 8, s3[1]);
+ LoadAligned32x2U32(square_sum3, x + 8, sq3);
+ LoadAligned16x3U16(sum5, x + 8, s5[1]);
+ LoadAligned32x3U32(square_sum5, x + 8, sq5);
CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[0][1], &b3[0][2]);
CalculateIntermediate3<0>(s3[1] + 1, sq3 + 1, scales[1], &ma3[1][1],
&b3[1][2]);
@@ -1483,90 +1443,55 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
- const uint8_t* const src, const uint16_t scales[2],
+ uint8x16_t* const s, const uint16_t scales[2],
const uint16_t* const sum3[4], const uint16_t* const sum5[5],
const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
- uint8x16_t* const s, uint16x8_t sq[2], uint8x16_t* const ma3,
- uint8x16_t* const ma5, uint16x8_t* const b3, uint16x8_t* const b5) {
+ uint16x8_t sq[2], uint8x16_t* const ma3, uint8x16_t* const ma5,
+ uint16x8_t* const b3, uint16x8_t* const b5) {
uint16x8_t s3[3], s5[5];
uint32x4x2_t sq3[3], sq5[5];
- *s = vld1q_u8(src);
- sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s));
- sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s));
+ sq[0] = SquareLo8(s[0]);
+ sq[1] = SquareHi8(s[0]);
SumHorizontal(*s, sq, &s3[2], &s5[3], &sq3[2], &sq5[3]);
- s5[0] = vld1q_u16(sum5[0]);
- s5[1] = vld1q_u16(sum5[1]);
- s5[2] = vld1q_u16(sum5[2]);
+ LoadAligned16x3U16(sum5, 0, s5);
s5[4] = s5[3];
- sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
- sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
- sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
- sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
- sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
- sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
sq5[4] = sq5[3];
CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
- s3[0] = vld1q_u16(sum3[0]);
- s3[1] = vld1q_u16(sum3[1]);
- sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0);
- sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4);
- sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0);
- sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
CalculateIntermediate3<0>(s3, sq3, scales[1], ma3, b3);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
- const uint8_t* const src, const ptrdiff_t x, const uint16_t scales[2],
+ uint8x16_t s[2], const ptrdiff_t x, const uint16_t scales[2],
const uint16_t* const sum3[4], const uint16_t* const sum5[5],
const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
- uint8x16_t s[2], uint16x8_t sq[3], uint8x16_t ma3[2], uint8x16_t ma5[2],
- uint16x8_t b3[2], uint16x8_t b5[2]) {
+ uint16x8_t sq[3], uint8x16_t ma3[2], uint8x16_t ma5[2], uint16x8_t b3[2],
+ uint16x8_t b5[2]) {
uint16x8_t s3[2][3], s5[2][5];
uint32x4x2_t sq3[3], sq5[5];
- s[1] = vld1q_u8(src + x + 8);
- sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+ sq[1] = SquareLo8(s[1]);
SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
SumHorizontal(sq, &sq3[2], &sq5[3]);
- s5[0][0] = vld1q_u16(sum5[0] + x);
- s5[0][1] = vld1q_u16(sum5[1] + x);
- s5[0][2] = vld1q_u16(sum5[2] + x);
+ LoadAligned16x3U16(sum5, x, s5[0]);
s5[0][4] = s5[0][3];
- sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
- sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
- sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
- sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
- sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
- sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+ LoadAligned32x3U32(square_sum5, x, sq5);
sq5[4] = sq5[3];
CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]);
- s3[0][0] = vld1q_u16(sum3[0] + x);
- s3[0][1] = vld1q_u16(sum3[1] + x);
- sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
- sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
- sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
- sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0], &b3[0]);
- sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+ sq[2] = SquareHi8(s[1]);
SumHorizontal(sq + 1, &sq3[2], &sq5[3]);
- s5[1][0] = vld1q_u16(sum5[0] + x + 8);
- s5[1][1] = vld1q_u16(sum5[1] + x + 8);
- s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+ LoadAligned16x3U16(sum5, x + 8, s5[1]);
s5[1][4] = s5[1][3];
- sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
- sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
- sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
- sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
- sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
- sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ LoadAligned32x3U32(square_sum5, x + 8, sq5);
sq5[4] = sq5[3];
CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]);
- s3[1][0] = vld1q_u16(sum3[0] + x + 8);
- s3[1][1] = vld1q_u16(sum3[1] + x + 8);
- sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8);
- sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12);
- sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8);
- sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12);
+ LoadAligned16x2U16(sum3, x + 8, s3[1]);
+ LoadAligned32x2U32(square_sum3, x + 8, sq3);
CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[1], &b3[1]);
}
@@ -1576,18 +1501,23 @@ inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
uint16_t* const sum5[5],
uint32_t* const square_sum5[5],
uint16_t* ma565, uint32_t* b565) {
+ const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
uint8x16_t s[2][2], mas[2];
uint16x8_t sq[2][4], bs[3];
- BoxFilterPreProcess5Lo(src0, src1, scale, s, sum5, square_sum5, sq, &mas[0],
- &bs[0]);
+ // TODO(b/194217060): Future msan load.
+ s[0][0] = vld1q_u8(src0);
+ s[1][0] = vld1q_u8(src1);
+
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
int x = 0;
do {
uint16x8_t ma[2];
uint8x16_t masx[3];
uint32x4x2_t b[2];
- BoxFilterPreProcess5(src0, src1, x + 8, scale, s, sum5, square_sum5, sq,
- mas, bs + 1);
+ s[0][1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes);
+ s[1][1] = Load1QMsanU8(src1 + x + 16, x + 16 + overread_in_bytes);
+ BoxFilterPreProcess5(s, x + 8, scale, sum5, square_sum5, sq, mas, bs + 1);
Prepare3_8<0>(mas, masx);
ma[0] = Sum565<0>(masx);
b[0] = Sum565W(bs);
@@ -1617,15 +1547,17 @@ LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
const uint8_t* const src, const int width, const uint32_t scale,
uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16_t* ma343,
uint16_t* ma444, uint32_t* b343, uint32_t* b444) {
+ const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass2 - width;
uint8x16_t s[2], mas[2];
uint16x8_t sq[4], bs[3];
- BoxFilterPreProcess3Lo(src, scale, &s[0], sum3, square_sum3, sq, &mas[0],
- &bs[0]);
+ s[0] = Load1QMsanU8(src, overread_in_bytes);
+ BoxFilterPreProcess3Lo(&s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
int x = 0;
do {
uint8x16_t ma3x[3];
- BoxFilterPreProcess3(src, x + 8, scale, sum3, square_sum3, s, sq + 1, mas,
+ s[1] = Load1QMsanU8(src + x + 16, x + 16 + overread_in_bytes);
+ BoxFilterPreProcess3(s, x + 8, scale, sum3, square_sum3, sq + 1, mas,
bs + 1);
Prepare3_8<0>(mas, ma3x);
if (calculate444) {
@@ -1664,43 +1596,43 @@ inline void BoxSumFilterPreProcess(
uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
uint16_t* const ma343[4], uint16_t* const ma444, uint16_t* ma565,
uint32_t* const b343[4], uint32_t* const b444, uint32_t* b565) {
+ const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
uint8x16_t s[2][2], ma3[2][2], ma5[2];
uint16x8_t sq[2][4], b3[2][3], b5[3];
- BoxFilterPreProcessLo(src0, src1, scales, s, sum3, sum5, square_sum3,
- square_sum5, sq, ma3, b3, &ma5[0], &b5[0]);
+ // TODO(b/194217060): Future msan load.
+ s[0][0] = vld1q_u8(src0);
+ s[1][0] = vld1q_u8(src1);
+
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], &b5[0]);
int x = 0;
do {
uint16x8_t ma[2];
uint8x16_t ma3x[3], ma5x[3];
uint32x4x2_t b[2];
- BoxFilterPreProcess(src0, src1, x + 8, scales, s, sum3, sum5, square_sum3,
- square_sum5, sq, ma3, b3, ma5, b5 + 1);
+
+ s[0][1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes);
+ s[1][1] = Load1QMsanU8(src1 + x + 16, x + 16 + overread_in_bytes);
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sq, ma3, b3, ma5, b5 + 1);
Prepare3_8<0>(ma3[0], ma3x);
ma[0] = Sum343<0>(ma3x);
ma[1] = Sum343<8>(ma3x);
+ StoreAligned32U16(ma343[0] + x, ma);
b[0] = Sum343W(b3[0] + 0);
b[1] = Sum343W(b3[0] + 1);
- vst1q_u16(ma343[0] + x, ma[0]);
- vst1q_u16(ma343[0] + x + 8, ma[1]);
- vst1q_u32(b343[0] + x, b[0].val[0]);
- vst1q_u32(b343[0] + x + 4, b[0].val[1]);
- vst1q_u32(b343[0] + x + 8, b[1].val[0]);
- vst1q_u32(b343[0] + x + 12, b[1].val[1]);
+ StoreAligned64U32(b343[0] + x, b);
Prepare3_8<0>(ma3[1], ma3x);
Store343_444<0>(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
Store343_444<8>(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
Prepare3_8<0>(ma5, ma5x);
ma[0] = Sum565<0>(ma5x);
ma[1] = Sum565<8>(ma5x);
+ StoreAligned32U16(ma565, ma);
b[0] = Sum565W(b5);
b[1] = Sum565W(b5 + 1);
- vst1q_u16(ma565, ma[0]);
- vst1q_u16(ma565 + 8, ma[1]);
- vst1q_u32(b565 + 0, b[0].val[0]);
- vst1q_u32(b565 + 4, b[0].val[1]);
- vst1q_u32(b565 + 8, b[1].val[0]);
- vst1q_u32(b565 + 12, b[1].val[1]);
+ StoreAligned64U32(b565, b);
s[0][0] = s[0][1];
s[1][0] = s[1][1];
sq[0][1] = sq[0][3];
@@ -1799,10 +1731,13 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
uint32_t* const square_sum5[5], const int width, const uint32_t scale,
const int16_t w0, uint16_t* const ma565[2], uint32_t* const b565[2],
uint8_t* const dst) {
+ const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
uint8x16_t s[2][2], mas[2];
uint16x8_t sq[2][4], bs[3];
- BoxFilterPreProcess5Lo(src0, src1, scale, s, sum5, square_sum5, sq, &mas[0],
- &bs[0]);
+ s[0][0] = Load1QMsanU8(src0, overread_in_bytes);
+ s[1][0] = Load1QMsanU8(src1, overread_in_bytes);
+
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
int x = 0;
do {
@@ -1810,8 +1745,9 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
uint8x16_t masx[3];
uint32x4x2_t b[2];
int16x8_t p0, p1;
- BoxFilterPreProcess5(src0, src1, x + 8, scale, s, sum5, square_sum5, sq,
- mas, bs + 1);
+ s[0][1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes);
+ s[1][1] = Load1QMsanU8(src1 + x + 16, x + 16 + overread_in_bytes);
+ BoxFilterPreProcess5(s, x + 8, scale, sum5, square_sum5, sq, mas, bs + 1);
Prepare3_8<0>(mas, masx);
ma[1] = Sum565<0>(masx);
b[1] = Sum565W(bs);
@@ -1865,7 +1801,10 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src,
uint8_t* const dst) {
uint8x16_t s[2], mas[2];
uint16x8_t sq[4], bs[4];
- BoxFilterPreProcess5LastRowLo(src0, scale, s, sum5, square_sum5, sq, &mas[0],
+ // TODO(b/194217060): Future msan load.
+ s[0] = vld1q_u8(src0);
+
+ BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0],
&bs[0]);
int x = 0;
@@ -1873,8 +1812,11 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src,
uint16x8_t ma[2];
uint8x16_t masx[3];
uint32x4x2_t b[2];
- BoxFilterPreProcess5LastRow(src0, x + 8, scale, s, sum5, square_sum5,
- sq + 1, mas, bs + 1);
+ // TODO(b/194217060): Future msan load.
+ s[1] = vld1q_u8(src0 + x + 16);
+
+ BoxFilterPreProcess5LastRow(s, x + 8, scale, sum5, square_sum5, sq + 1, mas,
+ bs + 1);
Prepare3_8<0>(mas, masx);
ma[1] = Sum565<0>(masx);
b[1] = Sum565W(bs);
@@ -1911,17 +1853,21 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
uint32_t* const square_sum3[3], uint16_t* const ma343[3],
uint16_t* const ma444[2], uint32_t* const b343[3], uint32_t* const b444[2],
uint8_t* const dst) {
+ const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass2 - width;
uint8x16_t s[2], mas[2];
uint16x8_t sq[4], bs[3];
- BoxFilterPreProcess3Lo(src0, scale, &s[0], sum3, square_sum3, sq, &mas[0],
- &bs[0]);
+ // TODO(b/194217060): Future msan load.
+ s[0] = vld1q_u8(src0);
+
+ BoxFilterPreProcess3Lo(&s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
int x = 0;
do {
uint16x8_t ma[3];
uint8x16_t ma3x[3];
uint32x4x2_t b[3];
- BoxFilterPreProcess3(src0, x + 8, scale, sum3, square_sum3, s, sq + 1, mas,
+ s[1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes);
+ BoxFilterPreProcess3(s, x + 8, scale, sum3, square_sum3, sq + 1, mas,
bs + 1);
Prepare3_8<0>(mas, ma3x);
Store343_444<0>(ma3x, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2],
@@ -1966,10 +1912,15 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter(
uint16_t* const ma343[4], uint16_t* const ma444[3],
uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
uint32_t* const b565[2], uint8_t* const dst) {
+ const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
uint8x16_t s[2][2], ma3[2][2], ma5[2];
uint16x8_t sq[2][4], b3[2][3], b5[3];
- BoxFilterPreProcessLo(src0, src1, scales, s, sum3, sum5, square_sum3,
- square_sum5, sq, ma3, b3, &ma5[0], &b5[0]);
+ // TODO(b/194217060): Future msan load.
+ s[0][0] = vld1q_u8(src0);
+ s[1][0] = vld1q_u8(src1);
+
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], &b5[0]);
int x = 0;
do {
@@ -1977,8 +1928,10 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter(
uint8x16_t ma3x[2][3], ma5x[3];
uint32x4x2_t b[3][3];
int16x8_t p[2][2];
- BoxFilterPreProcess(src0, src1, x + 8, scales, s, sum3, sum5, square_sum3,
- square_sum5, sq, ma3, b3, ma5, b5 + 1);
+ s[0][1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes);
+ s[1][1] = Load1QMsanU8(src1 + x + 16, x + 16 + overread_in_bytes);
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sq, ma3, b3, ma5, b5 + 1);
Prepare3_8<0>(ma3[0], ma3x[0]);
Prepare3_8<0>(ma3[1], ma3x[1]);
Store343_444<0>(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1],
@@ -2070,17 +2023,21 @@ inline void BoxFilterLastRow(
uint8x16_t s[2], ma3[2], ma5[2];
uint16x8_t sq[4], ma[3], b3[3], b5[3];
uint32x4x2_t b[3];
- BoxFilterPreProcessLastRowLo(src0, scales, sum3, sum5, square_sum3,
- square_sum5, &s[0], sq, &ma3[0], &ma5[0], &b3[0],
- &b5[0]);
+ // TODO(b/194217060): Future msan load.
+ s[0] = vld1q_u8(src0);
+
+ BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
+ sq, &ma3[0], &ma5[0], &b3[0], &b5[0]);
int x = 0;
do {
uint8x16_t ma3x[3], ma5x[3];
int16x8_t p[2];
- BoxFilterPreProcessLastRow(src0, x + 8, scales, sum3, sum5, square_sum3,
- square_sum5, s, sq + 1, ma3, ma5, &b3[1],
- &b5[1]);
+ // TODO(b/194217060): Future msan load.
+ s[1] = vld1q_u8(src0 + x + 16);
+
+ BoxFilterPreProcessLastRow(s, x + 8, scales, sum3, sum5, square_sum3,
+ square_sum5, sq + 1, ma3, ma5, &b3[1], &b5[1]);
Prepare3_8<0>(ma5, ma5x);
ma[1] = Sum565<0>(ma5x);
b[1] = Sum565W(b5);
@@ -2137,6 +2094,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
const ptrdiff_t bottom_border_stride, const int width, const int height,
SgrBuffer* const sgr_buffer, uint8_t* dst) {
const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
const ptrdiff_t sum_stride = temp_stride + 8;
const int sgr_proj_index = restoration_info.sgr_proj_info.index;
const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
@@ -2173,8 +2131,8 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
b565[1] = b565[0] + temp_stride;
assert(scales[0] != 0);
assert(scales[1] != 0);
- BoxSum(top_border, top_border_stride, sum_stride, sum3[0], sum5[1],
- square_sum3[0], square_sum5[1]);
+ BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
+ sum5[1], square_sum3[0], square_sum5[1]);
sum5[0] = sum5[1];
square_sum5[0] = square_sum5[1];
const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
@@ -2250,6 +2208,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
const int width, const int height,
SgrBuffer* const sgr_buffer, uint8_t* dst) {
const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
const ptrdiff_t sum_stride = temp_stride + 8;
const int sgr_proj_index = restoration_info.sgr_proj_info.index;
const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
@@ -2267,7 +2226,8 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
b565[0] = sgr_buffer->b565;
b565[1] = b565[0] + temp_stride;
assert(scale != 0);
- BoxSum<5>(top_border, top_border_stride, sum_stride, sum5[1], square_sum5[1]);
+ BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum5[1], square_sum5[1]);
sum5[0] = sum5[1];
square_sum5[0] = square_sum5[1];
const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
@@ -2325,6 +2285,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
SgrBuffer* const sgr_buffer, uint8_t* dst) {
assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
const ptrdiff_t sum_stride = temp_stride + 8;
const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
@@ -2347,7 +2308,8 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
b444[0] = sgr_buffer->b444;
b444[1] = b444[0] + temp_stride;
assert(scale != 0);
- BoxSum<3>(top_border, top_border_stride, sum_stride, sum3[0], square_sum3[0]);
+ BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum3[0], square_sum3[0]);
BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, ma343[0],
nullptr, b343[0], nullptr);
Circulate3PointersBy1<uint16_t>(sum3);
@@ -2396,11 +2358,14 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
// the end of each row. It is safe to overwrite the output as it will not be
// part of the visible frame.
void SelfGuidedFilter_NEON(
- const RestorationUnitInfo& restoration_info, const void* const source,
- const ptrdiff_t stride, const void* const top_border,
- const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+ const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_border,
+ const ptrdiff_t top_border_stride,
+ const void* LIBGAV1_RESTRICT const bottom_border,
const ptrdiff_t bottom_border_stride, const int width, const int height,
- RestorationBuffer* const restoration_buffer, void* const dest) {
+ RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+ void* LIBGAV1_RESTRICT const dest) {
const int index = restoration_info.sgr_proj_info.index;
const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
@@ -2409,6 +2374,12 @@ void SelfGuidedFilter_NEON(
const auto* bottom = static_cast<const uint8_t*>(bottom_border);
auto* const dst = static_cast<uint8_t*>(dest);
SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+
+#if LIBGAV1_MSAN
+ // Initialize to prevent msan warnings when intermediate overreads occur.
+ memset(sgr_buffer, 0, sizeof(SgrBuffer));
+#endif
+
if (radius_pass_1 == 0) {
// |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
// following assertion.