aboutsummaryrefslogtreecommitdiff
path: root/src/dsp/arm/intrapred_smooth_neon.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/arm/intrapred_smooth_neon.cc')
-rw-r--r--src/dsp/arm/intrapred_smooth_neon.cc339
1 files changed, 156 insertions, 183 deletions
diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc
index bcda131..d6c1450 100644
--- a/src/dsp/arm/intrapred_smooth_neon.cc
+++ b/src/dsp/arm/intrapred_smooth_neon.cc
@@ -31,7 +31,6 @@
namespace libgav1 {
namespace dsp {
-
namespace low_bitdepth {
namespace {
@@ -42,20 +41,15 @@ constexpr uint8_t kSmoothWeights[] = {
#include "src/dsp/smooth_weights.inc"
};
-inline uint16x4_t CalculatePred(const uint16x4_t weighted_top,
- const uint16x4_t weighted_left,
- const uint16x4_t weighted_bl,
- const uint16x4_t weighted_tr) {
- const uint32x4_t pred_0 = vaddl_u16(weighted_top, weighted_left);
- const uint32x4_t pred_1 = vaddl_u16(weighted_bl, weighted_tr);
- const uint32x4_t pred_2 = vaddq_u32(pred_0, pred_1);
- return vrshrn_n_u32(pred_2, kSmoothWeightScale + 1);
+// 256 - v = vneg_s8(v)
+inline uint8x8_t NegateS8(const uint8x8_t v) {
+ return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v)));
}
template <int height>
-inline void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
constexpr int width = 4;
const auto* const top = static_cast<const uint8_t*>(top_row);
const auto* const left = static_cast<const uint8_t*>(left_column);
@@ -68,47 +62,49 @@ inline void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const uint8x8_t top_right_v = vdup_n_u8(top_right);
const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
const uint8x8_t weights_x_v = Load4(kSmoothWeights + width - 4);
- // 256 - weights = vneg_s8(weights)
- const uint8x8_t scaled_weights_x =
- vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v)));
+ const uint8x8_t scaled_weights_x = NegateS8(weights_x_v);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
for (int y = 0; y < height; ++y) {
const uint8x8_t left_v = vdup_n_u8(left[y]);
const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
- const uint8x8_t scaled_weights_y =
- vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_y_v)));
- const uint16x4_t weighted_bl =
- vget_low_u16(vmull_u8(scaled_weights_y, bottom_left_v));
-
- const uint16x4_t weighted_top = vget_low_u16(vmull_u8(weights_y_v, top_v));
- const uint16x4_t weighted_left =
- vget_low_u16(vmull_u8(weights_x_v, left_v));
- const uint16x4_t weighted_tr =
- vget_low_u16(vmull_u8(scaled_weights_x, top_right_v));
- const uint16x4_t result =
- CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr);
-
- StoreLo4(dst, vmovn_u16(vcombine_u16(result, result)));
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
+ const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+ const uint16x8_t weighted_top_bl =
+ vmlal_u8(weighted_bl, weights_y_v, top_v);
+ const uint16x8_t weighted_left_tr =
+ vmlal_u8(weighted_tr, weights_x_v, left_v);
+ // Maximum value of each parameter: 0xFF00
+ const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+ const uint8x8_t result = vrshrn_n_u16(avg, kSmoothWeightScale);
+
+ StoreLo4(dst, result);
dst += stride;
}
}
-inline uint8x8_t CalculatePred(const uint16x8_t weighted_top,
- const uint16x8_t weighted_left,
- const uint16x8_t weighted_bl,
- const uint16x8_t weighted_tr) {
- // Maximum value: 0xFF00
- const uint16x8_t pred_0 = vaddq_u16(weighted_top, weighted_bl);
- // Maximum value: 0xFF00
- const uint16x8_t pred_1 = vaddq_u16(weighted_left, weighted_tr);
- const uint16x8_t pred_2 = vhaddq_u16(pred_0, pred_1);
- return vrshrn_n_u16(pred_2, kSmoothWeightScale);
+inline uint8x8_t CalculatePred(const uint16x8_t weighted_top_bl,
+ const uint16x8_t weighted_left_tr) {
+ // Maximum value of each parameter: 0xFF00
+ const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+ return vrshrn_n_u16(avg, kSmoothWeightScale);
+}
+
+inline uint8x8_t CalculateWeightsAndPred(
+ const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
+ const uint8x8_t bottom_left, const uint8x8_t weights_x,
+ const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
+ const uint16x8_t weighted_top = vmull_u8(weights_y, top);
+ const uint16x8_t weighted_top_bl =
+ vmlal_u8(weighted_top, scaled_weights_y, bottom_left);
+ const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left);
+ return CalculatePred(weighted_top_bl, weighted_left_tr);
}
template <int height>
-inline void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
constexpr int width = 8;
const auto* const top = static_cast<const uint8_t*>(top_row);
const auto* const left = static_cast<const uint8_t*>(left_column);
@@ -121,21 +117,16 @@ inline void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const uint8x8_t top_right_v = vdup_n_u8(top_right);
const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
const uint8x8_t weights_x_v = vld1_u8(kSmoothWeights + width - 4);
- // 256 - weights = vneg_s8(weights)
- const uint8x8_t scaled_weights_x =
- vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v)));
+ const uint8x8_t scaled_weights_x = NegateS8(weights_x_v);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
for (int y = 0; y < height; ++y) {
const uint8x8_t left_v = vdup_n_u8(left[y]);
const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
- const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
- const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
-
- const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);
- const uint16x8_t weighted_left = vmull_u8(weights_x_v, left_v);
- const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
const uint8x8_t result =
- CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr);
+ CalculateWeightsAndPred(top_v, left_v, weighted_tr, bottom_left_v,
+ weights_x_v, scaled_weights_y, weights_y_v);
vst1_u8(dst, result);
dst += stride;
@@ -146,28 +137,34 @@ inline uint8x16_t CalculateWeightsAndPred(
const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
const uint8x8_t weights_y, const uint8x16_t weights_x,
const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
- const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top));
+ const uint16x8_t weighted_top_bl_low =
+ vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
- const uint16x8_t weighted_tr_low =
- vmull_u8(vget_low_u8(scaled_weights_x), top_right);
- const uint8x8_t result_low = CalculatePred(
- weighted_top_low, weighted_left_low, weighted_bl, weighted_tr_low);
+ const uint16x8_t weighted_left_tr_low =
+ vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
+ const uint8x8_t result_low =
+ CalculatePred(weighted_top_bl_low, weighted_left_tr_low);
- const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top));
+ const uint16x8_t weighted_top_bl_high =
+ vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
- const uint16x8_t weighted_tr_high =
- vmull_u8(vget_high_u8(scaled_weights_x), top_right);
- const uint8x8_t result_high = CalculatePred(
- weighted_top_high, weighted_left_high, weighted_bl, weighted_tr_high);
+ const uint16x8_t weighted_left_tr_high =
+ vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
+ const uint8x8_t result_high =
+ CalculatePred(weighted_top_bl_high, weighted_left_tr_high);
return vcombine_u8(result_low, result_high);
}
+// 256 - v = vneg_s8(v)
+inline uint8x16_t NegateS8(const uint8x16_t v) {
+ return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
+}
+
template <int width, int height>
-inline void Smooth16PlusxN_NEON(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const auto* const left = static_cast<const uint8_t*>(left_column);
const uint8_t top_right = top[width - 1];
@@ -188,9 +185,6 @@ inline void Smooth16PlusxN_NEON(
const uint8x8_t top_right_v = vdup_n_u8(top_right);
const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
- // TODO(johannkoenig): Consider re-reading top_v and weights_x_v in the loop.
- // This currently has a performance slope similar to Paeth so it does not
- // appear to be register bound for arm64.
uint8x16_t weights_x_v[4];
weights_x_v[0] = vld1q_u8(kSmoothWeights + width - 4);
if (width > 16) {
@@ -202,23 +196,19 @@ inline void Smooth16PlusxN_NEON(
}
uint8x16_t scaled_weights_x[4];
- scaled_weights_x[0] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[0])));
+ scaled_weights_x[0] = NegateS8(weights_x_v[0]);
if (width > 16) {
- scaled_weights_x[1] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[1])));
+ scaled_weights_x[1] = NegateS8(weights_x_v[1]);
if (width == 64) {
- scaled_weights_x[2] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[2])));
- scaled_weights_x[3] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[3])));
+ scaled_weights_x[2] = NegateS8(weights_x_v[2]);
+ scaled_weights_x[3] = NegateS8(weights_x_v[3]);
}
}
for (int y = 0; y < height; ++y) {
const uint8x8_t left_v = vdup_n_u8(left[y]);
const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
- const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
vst1q_u8(dst, CalculateWeightsAndPred(top_v[0], left_v, top_right_v,
@@ -246,10 +236,10 @@ inline void Smooth16PlusxN_NEON(
}
template <int width, int height>
-inline void SmoothVertical4Or8xN_NEON(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothVertical4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const auto* const left = static_cast<const uint8_t*>(left_column);
const uint8_t bottom_left = left[height - 1];
@@ -267,17 +257,17 @@ inline void SmoothVertical4Or8xN_NEON(
for (int y = 0; y < height; ++y) {
const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
- const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);
- const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
- const uint16x8_t pred = vaddq_u16(weighted_top, weighted_bl);
- const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale);
+ const uint16x8_t weighted_top_bl =
+ vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v);
+ const uint8x8_t pred = vrshrn_n_u16(weighted_top_bl, kSmoothWeightScale);
if (width == 4) {
- StoreLo4(dst, pred_scaled);
+ StoreLo4(dst, pred);
} else { // width == 8
- vst1_u8(dst, pred_scaled);
+ vst1_u8(dst, pred);
}
dst += stride;
}
@@ -286,10 +276,10 @@ inline void SmoothVertical4Or8xN_NEON(
inline uint8x16_t CalculateVerticalWeightsAndPred(
const uint8x16_t top, const uint8x8_t weights_y,
const uint16x8_t weighted_bl) {
- const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top));
- const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top));
- const uint16x8_t pred_low = vaddq_u16(weighted_top_low, weighted_bl);
- const uint16x8_t pred_high = vaddq_u16(weighted_top_high, weighted_bl);
+ const uint16x8_t pred_low =
+ vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
+ const uint16x8_t pred_high =
+ vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale);
const uint8x8_t pred_scaled_high =
vrshrn_n_u16(pred_high, kSmoothWeightScale);
@@ -297,7 +287,7 @@ inline uint8x16_t CalculateVerticalWeightsAndPred(
}
template <int width, int height>
-inline void SmoothVertical16PlusxN_NEON(
+void SmoothVertical16PlusxN_NEON(
void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const void* LIBGAV1_RESTRICT const top_row,
const void* LIBGAV1_RESTRICT const left_column) {
@@ -321,7 +311,7 @@ inline void SmoothVertical16PlusxN_NEON(
for (int y = 0; y < height; ++y) {
const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
- const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
const uint8x16_t pred_0 =
@@ -349,7 +339,7 @@ inline void SmoothVertical16PlusxN_NEON(
}
template <int width, int height>
-inline void SmoothHorizontal4Or8xN_NEON(
+void SmoothHorizontal4Or8xN_NEON(
void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const void* LIBGAV1_RESTRICT const top_row,
const void* LIBGAV1_RESTRICT const left_column) {
@@ -361,22 +351,19 @@ inline void SmoothHorizontal4Or8xN_NEON(
const uint8x8_t top_right_v = vdup_n_u8(top_right);
// Over-reads for 4xN but still within the array.
const uint8x8_t weights_x = vld1_u8(kSmoothWeights + width - 4);
- // 256 - weights = vneg_s8(weights)
- const uint8x8_t scaled_weights_x =
- vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x)));
+ const uint8x8_t scaled_weights_x = NegateS8(weights_x);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
for (int y = 0; y < height; ++y) {
const uint8x8_t left_v = vdup_n_u8(left[y]);
-
- const uint16x8_t weighted_left = vmull_u8(weights_x, left_v);
- const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
- const uint16x8_t pred = vaddq_u16(weighted_left, weighted_tr);
- const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale);
+ const uint16x8_t weighted_left_tr =
+ vmlal_u8(weighted_tr, weights_x, left_v);
+ const uint8x8_t pred = vrshrn_n_u16(weighted_left_tr, kSmoothWeightScale);
if (width == 4) {
- StoreLo4(dst, pred_scaled);
+ StoreLo4(dst, pred);
} else { // width == 8
- vst1_u8(dst, pred_scaled);
+ vst1_u8(dst, pred);
}
dst += stride;
}
@@ -386,23 +373,22 @@ inline uint8x16_t CalculateHorizontalWeightsAndPred(
const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
const uint8x16_t scaled_weights_x) {
const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
- const uint16x8_t weighted_tr_low =
- vmull_u8(vget_low_u8(scaled_weights_x), top_right);
- const uint16x8_t pred_low = vaddq_u16(weighted_left_low, weighted_tr_low);
- const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale);
+ const uint16x8_t weighted_left_tr_low =
+ vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
+ const uint8x8_t pred_scaled_low =
+ vrshrn_n_u16(weighted_left_tr_low, kSmoothWeightScale);
const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
- const uint16x8_t weighted_tr_high =
- vmull_u8(vget_high_u8(scaled_weights_x), top_right);
- const uint16x8_t pred_high = vaddq_u16(weighted_left_high, weighted_tr_high);
+ const uint16x8_t weighted_left_tr_high =
+ vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
const uint8x8_t pred_scaled_high =
- vrshrn_n_u16(pred_high, kSmoothWeightScale);
+ vrshrn_n_u16(weighted_left_tr_high, kSmoothWeightScale);
return vcombine_u8(pred_scaled_low, pred_scaled_high);
}
template <int width, int height>
-inline void SmoothHorizontal16PlusxN_NEON(
+void SmoothHorizontal16PlusxN_NEON(
void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const void* LIBGAV1_RESTRICT const top_row,
const void* LIBGAV1_RESTRICT const left_column) {
@@ -424,16 +410,12 @@ inline void SmoothHorizontal16PlusxN_NEON(
}
uint8x16_t scaled_weights_x[4];
- scaled_weights_x[0] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[0])));
+ scaled_weights_x[0] = NegateS8(weights_x[0]);
if (width > 16) {
- scaled_weights_x[1] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[1])));
+ scaled_weights_x[1] = NegateS8(weights_x[1]);
if (width == 64) {
- scaled_weights_x[2] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[2])));
- scaled_weights_x[3] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[3])));
+ scaled_weights_x[2] = NegateS8(weights_x[2]);
+ scaled_weights_x[3] = NegateS8(weights_x[3]);
}
}
@@ -633,10 +615,15 @@ constexpr uint16_t kSmoothWeights[] = {
#include "src/dsp/smooth_weights.inc"
};
+// 256 - v = vneg_s8(v)
+inline uint16x4_t NegateS8(const uint16x4_t v) {
+ return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v)));
+}
+
template <int height>
-inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t top_right = top[3];
@@ -647,9 +634,7 @@ inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const uint16x4_t top_v = vld1_u16(top);
const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
const uint16x4_t weights_x_v = vld1_u16(kSmoothWeights);
- const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x_v);
-
- // Weighted top right doesn't change with each row.
+ const uint16x4_t scaled_weights_x = NegateS8(weights_x_v);
const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
for (int y = 0; y < height; ++y) {
@@ -670,10 +655,10 @@ inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
// Common code between 8xH and [16|32|64]xH.
inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst,
- const uint32x4_t& weighted_corners_low,
- const uint32x4_t& weighted_corners_high,
- const uint16x4x2_t& top_vals,
- const uint16x4x2_t& weights_x, const uint16_t left_y,
+ const uint32x4_t weighted_corners_low,
+ const uint32x4_t weighted_corners_high,
+ const uint16x4x2_t top_vals,
+ const uint16x4x2_t weights_x, const uint16_t left_y,
const uint16_t weight_y) {
// Each variable in the running summation is named for the last item to be
// accumulated.
@@ -697,9 +682,9 @@ inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst,
}
template <int height>
-inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t top_right = top[7];
@@ -712,14 +697,12 @@ inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4),
vld1_u16(kSmoothWeights + 8)};
- // Weighted top right doesn't change with each row.
const uint32x4_t weighted_tr_low =
- vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right);
+ vmull_n_u16(NegateS8(weights_x.val[0]), top_right);
const uint32x4_t weighted_tr_high =
- vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right);
+ vmull_n_u16(NegateS8(weights_x.val[1]), top_right);
for (int y = 0; y < height; ++y) {
- // |weighted_bl| is invariant across the row.
const uint32x4_t weighted_bl =
vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
const uint32x4_t weighted_corners_low =
@@ -735,9 +718,9 @@ inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
// For width 16 and above.
template <int width, int height>
-inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t top_right = top[width - 1];
@@ -746,23 +729,19 @@ inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
auto* dst = static_cast<uint8_t*>(dest);
- const uint16x4_t weight_scaling = vdup_n_u16(256);
// Precompute weighted values that don't vary with |y|.
uint32x4_t weighted_tr_low[width >> 3];
uint32x4_t weighted_tr_high[width >> 3];
for (int i = 0; i < width >> 3; ++i) {
const int x = i << 3;
const uint16x4_t weights_x_low = vld1_u16(kSmoothWeights + width - 4 + x);
- weighted_tr_low[i] =
- vmull_n_u16(vsub_u16(weight_scaling, weights_x_low), top_right);
+ weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low), top_right);
const uint16x4_t weights_x_high = vld1_u16(kSmoothWeights + width + x);
- weighted_tr_high[i] =
- vmull_n_u16(vsub_u16(weight_scaling, weights_x_high), top_right);
+ weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high), top_right);
}
const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
for (int y = 0; y < height; ++y) {
- // |weighted_bl| is invariant across the row.
const uint32x4_t weighted_bl =
vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
auto* dst_x = reinterpret_cast<uint16_t*>(dst);
@@ -785,10 +764,9 @@ inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
}
template <int height>
-inline void SmoothVertical4xH_NEON(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothVertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t bottom_left = left[height - 1];
@@ -812,10 +790,10 @@ inline void SmoothVertical4xH_NEON(
}
template <int height>
-inline void SmoothVertical8xH_NEON(
- void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothVertical8xH_NEON(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t bottom_left = left[height - 1];
@@ -829,7 +807,6 @@ inline void SmoothVertical8xH_NEON(
for (int y = 0; y < height; ++y) {
auto* dst16 = reinterpret_cast<uint16_t*>(dst);
- // |weighted_bl| is invariant across the row.
const uint32x4_t weighted_bl =
vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
@@ -846,10 +823,10 @@ inline void SmoothVertical8xH_NEON(
// For width 16 and above.
template <int width, int height>
-inline void SmoothVerticalWxH_NEON(
- void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothVerticalWxH_NEON(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t bottom_left = left[height - 1];
@@ -865,7 +842,6 @@ inline void SmoothVerticalWxH_NEON(
const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
for (int y = 0; y < height; ++y) {
- // |weighted_bl| is invariant across the row.
const uint32x4_t weighted_bl =
vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
@@ -885,10 +861,10 @@ inline void SmoothVerticalWxH_NEON(
}
template <int height>
-inline void SmoothHorizontal4xH_NEON(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothHorizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t top_right = top[3];
@@ -896,7 +872,7 @@ inline void SmoothHorizontal4xH_NEON(
auto* dst = static_cast<uint8_t*>(dest);
const uint16x4_t weights_x = vld1_u16(kSmoothWeights);
- const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x);
+ const uint16x4_t scaled_weights_x = NegateS8(weights_x);
const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
for (int y = 0; y < height; ++y) {
@@ -909,10 +885,10 @@ inline void SmoothHorizontal4xH_NEON(
}
template <int height>
-inline void SmoothHorizontal8xH_NEON(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothHorizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t top_right = top[7];
@@ -923,9 +899,9 @@ inline void SmoothHorizontal8xH_NEON(
vld1_u16(kSmoothWeights + 8)};
const uint32x4_t weighted_tr_low =
- vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right);
+ vmull_n_u16(NegateS8(weights_x.val[0]), top_right);
const uint32x4_t weighted_tr_high =
- vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right);
+ vmull_n_u16(NegateS8(weights_x.val[1]), top_right);
for (int y = 0; y < height; ++y) {
auto* dst16 = reinterpret_cast<uint16_t*>(dst);
@@ -943,18 +919,16 @@ inline void SmoothHorizontal8xH_NEON(
// For width 16 and above.
template <int width, int height>
-inline void SmoothHorizontalWxH_NEON(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothHorizontalWxH_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t top_right = top[width - 1];
auto* dst = static_cast<uint8_t*>(dest);
- const uint16x4_t weight_scaling = vdup_n_u16(256);
-
uint16x4_t weights_x_low[width >> 3];
uint16x4_t weights_x_high[width >> 3];
uint32x4_t weighted_tr_low[width >> 3];
@@ -962,11 +936,9 @@ inline void SmoothHorizontalWxH_NEON(
for (int i = 0; i < width >> 3; ++i) {
const int x = i << 3;
weights_x_low[i] = vld1_u16(kSmoothWeights + width - 4 + x);
- weighted_tr_low[i] =
- vmull_n_u16(vsub_u16(weight_scaling, weights_x_low[i]), top_right);
+ weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low[i]), top_right);
weights_x_high[i] = vld1_u16(kSmoothWeights + width + x);
- weighted_tr_high[i] =
- vmull_n_u16(vsub_u16(weight_scaling, weights_x_high[i]), top_right);
+ weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high[i]), top_right);
}
for (int y = 0; y < height; ++y) {
@@ -1141,6 +1113,7 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
SmoothHorizontalWxH_NEON<64, 64>;
}
+
} // namespace
} // namespace high_bitdepth
#endif // LIBGAV1_MAX_BITDEPTH >= 10