aboutsummaryrefslogtreecommitdiff
path: root/src/dsp/arm/intrapred_neon.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/arm/intrapred_neon.cc')
-rw-r--r--src/dsp/arm/intrapred_neon.cc579
1 files changed, 408 insertions, 171 deletions
diff --git a/src/dsp/arm/intrapred_neon.cc b/src/dsp/arm/intrapred_neon.cc
index c143648..cd47a22 100644
--- a/src/dsp/arm/intrapred_neon.cc
+++ b/src/dsp/arm/intrapred_neon.cc
@@ -26,6 +26,7 @@
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
#include "src/utils/constants.h"
namespace libgav1 {
@@ -56,10 +57,10 @@ struct DcPredFuncs_NEON {
template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
DcStoreFunc storefn>
-void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn,
- storefn>::DcTop(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* /*left_column*/) {
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::
+ DcTop(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* /*left_column*/) {
const uint32x2_t sum = sumfn(top_row, block_width_log2, false, nullptr, 0);
const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2);
storefn(dest, stride, dc);
@@ -67,10 +68,10 @@ void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn,
template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
DcStoreFunc storefn>
-void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn,
- storefn>::DcLeft(void* const dest, ptrdiff_t stride,
- const void* /*top_row*/,
- const void* const left_column) {
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::
+ DcLeft(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* LIBGAV1_RESTRICT const left_column) {
const uint32x2_t sum =
sumfn(left_column, block_height_log2, false, nullptr, 0);
const uint32x2_t dc = vrshr_n_u32(sum, block_height_log2);
@@ -80,8 +81,9 @@ void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn,
template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
DcStoreFunc storefn>
void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::Dc(
- void* const dest, ptrdiff_t stride, const void* const top_row,
- const void* const left_column) {
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const uint32x2_t sum =
sumfn(top_row, block_width_log2, true, left_column, block_height_log2);
if (block_width_log2 == block_height_log2) {
@@ -154,92 +156,116 @@ inline uint16x8_t LoadAndAdd64(const uint8_t* buf) {
// If |use_ref_1| is false then only sum |ref_0|.
// For |ref[01]_size_log2| == 4 this relies on |ref_[01]| being aligned to
// uint32_t.
-inline uint32x2_t DcSum_NEON(const void* ref_0, const int ref_0_size_log2,
- const bool use_ref_1, const void* ref_1,
+inline uint32x2_t DcSum_NEON(const void* LIBGAV1_RESTRICT ref_0,
+ const int ref_0_size_log2, const bool use_ref_1,
+ const void* LIBGAV1_RESTRICT ref_1,
const int ref_1_size_log2) {
const auto* const ref_0_u8 = static_cast<const uint8_t*>(ref_0);
const auto* const ref_1_u8 = static_cast<const uint8_t*>(ref_1);
if (ref_0_size_log2 == 2) {
uint8x8_t val = Load4(ref_0_u8);
if (use_ref_1) {
- if (ref_1_size_log2 == 2) { // 4x4
- val = Load4<1>(ref_1_u8, val);
- return Sum(vpaddl_u8(val));
- } else if (ref_1_size_log2 == 3) { // 4x8
- const uint8x8_t val_1 = vld1_u8(ref_1_u8);
- const uint16x4_t sum_0 = vpaddl_u8(val);
- const uint16x4_t sum_1 = vpaddl_u8(val_1);
- return Sum(vadd_u16(sum_0, sum_1));
- } else if (ref_1_size_log2 == 4) { // 4x16
- const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
- return Sum(vaddw_u8(vpaddlq_u8(val_1), val));
+ switch (ref_1_size_log2) {
+ case 2: { // 4x4
+ val = Load4<1>(ref_1_u8, val);
+ return Sum(vpaddl_u8(val));
+ }
+ case 3: { // 4x8
+ const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+ const uint16x4_t sum_0 = vpaddl_u8(val);
+ const uint16x4_t sum_1 = vpaddl_u8(val_1);
+ return Sum(vadd_u16(sum_0, sum_1));
+ }
+ case 4: { // 4x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ return Sum(vaddw_u8(vpaddlq_u8(val_1), val));
+ }
}
}
// 4x1
const uint16x4_t sum = vpaddl_u8(val);
return vpaddl_u16(sum);
- } else if (ref_0_size_log2 == 3) {
+ }
+ if (ref_0_size_log2 == 3) {
const uint8x8_t val_0 = vld1_u8(ref_0_u8);
if (use_ref_1) {
- if (ref_1_size_log2 == 2) { // 8x4
- const uint8x8_t val_1 = Load4(ref_1_u8);
- const uint16x4_t sum_0 = vpaddl_u8(val_0);
- const uint16x4_t sum_1 = vpaddl_u8(val_1);
- return Sum(vadd_u16(sum_0, sum_1));
- } else if (ref_1_size_log2 == 3) { // 8x8
- const uint8x8_t val_1 = vld1_u8(ref_1_u8);
- const uint16x4_t sum_0 = vpaddl_u8(val_0);
- const uint16x4_t sum_1 = vpaddl_u8(val_1);
- return Sum(vadd_u16(sum_0, sum_1));
- } else if (ref_1_size_log2 == 4) { // 8x16
- const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
- return Sum(vaddw_u8(vpaddlq_u8(val_1), val_0));
- } else if (ref_1_size_log2 == 5) { // 8x32
- return Sum(vaddw_u8(LoadAndAdd32(ref_1_u8), val_0));
+ switch (ref_1_size_log2) {
+ case 2: { // 8x4
+ const uint8x8_t val_1 = Load4(ref_1_u8);
+ const uint16x4_t sum_0 = vpaddl_u8(val_0);
+ const uint16x4_t sum_1 = vpaddl_u8(val_1);
+ return Sum(vadd_u16(sum_0, sum_1));
+ }
+ case 3: { // 8x8
+ const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+ const uint16x4_t sum_0 = vpaddl_u8(val_0);
+ const uint16x4_t sum_1 = vpaddl_u8(val_1);
+ return Sum(vadd_u16(sum_0, sum_1));
+ }
+ case 4: { // 8x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ return Sum(vaddw_u8(vpaddlq_u8(val_1), val_0));
+ }
+ case 5: { // 8x32
+ return Sum(vaddw_u8(LoadAndAdd32(ref_1_u8), val_0));
+ }
}
}
// 8x1
return Sum(vpaddl_u8(val_0));
- } else if (ref_0_size_log2 == 4) {
+ }
+ if (ref_0_size_log2 == 4) {
const uint8x16_t val_0 = vld1q_u8(ref_0_u8);
if (use_ref_1) {
- if (ref_1_size_log2 == 2) { // 16x4
- const uint8x8_t val_1 = Load4(ref_1_u8);
- return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
- } else if (ref_1_size_log2 == 3) { // 16x8
- const uint8x8_t val_1 = vld1_u8(ref_1_u8);
- return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
- } else if (ref_1_size_log2 == 4) { // 16x16
- const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
- return Sum(Add(val_0, val_1));
- } else if (ref_1_size_log2 == 5) { // 16x32
- const uint16x8_t sum_0 = vpaddlq_u8(val_0);
- const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
- return Sum(vaddq_u16(sum_0, sum_1));
- } else if (ref_1_size_log2 == 6) { // 16x64
- const uint16x8_t sum_0 = vpaddlq_u8(val_0);
- const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
- return Sum(vaddq_u16(sum_0, sum_1));
+ switch (ref_1_size_log2) {
+ case 2: { // 16x4
+ const uint8x8_t val_1 = Load4(ref_1_u8);
+ return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
+ }
+ case 3: { // 16x8
+ const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+ return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
+ }
+ case 4: { // 16x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ return Sum(Add(val_0, val_1));
+ }
+ case 5: { // 16x32
+ const uint16x8_t sum_0 = vpaddlq_u8(val_0);
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 6: { // 16x64
+ const uint16x8_t sum_0 = vpaddlq_u8(val_0);
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
}
}
// 16x1
return Sum(vpaddlq_u8(val_0));
- } else if (ref_0_size_log2 == 5) {
+ }
+ if (ref_0_size_log2 == 5) {
const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u8);
if (use_ref_1) {
- if (ref_1_size_log2 == 3) { // 32x8
- const uint8x8_t val_1 = vld1_u8(ref_1_u8);
- return Sum(vaddw_u8(sum_0, val_1));
- } else if (ref_1_size_log2 == 4) { // 32x16
- const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
- const uint16x8_t sum_1 = vpaddlq_u8(val_1);
- return Sum(vaddq_u16(sum_0, sum_1));
- } else if (ref_1_size_log2 == 5) { // 32x32
- const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
- return Sum(vaddq_u16(sum_0, sum_1));
- } else if (ref_1_size_log2 == 6) { // 32x64
- const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
- return Sum(vaddq_u16(sum_0, sum_1));
+ switch (ref_1_size_log2) {
+ case 3: { // 32x8
+ const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+ return Sum(vaddw_u8(sum_0, val_1));
+ }
+ case 4: { // 32x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ const uint16x8_t sum_1 = vpaddlq_u8(val_1);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 5: { // 32x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 6: { // 32x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
}
}
// 32x1
@@ -249,16 +275,20 @@ inline uint32x2_t DcSum_NEON(const void* ref_0, const int ref_0_size_log2,
assert(ref_0_size_log2 == 6);
const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u8);
if (use_ref_1) {
- if (ref_1_size_log2 == 4) { // 64x16
- const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
- const uint16x8_t sum_1 = vpaddlq_u8(val_1);
- return Sum(vaddq_u16(sum_0, sum_1));
- } else if (ref_1_size_log2 == 5) { // 64x32
- const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
- return Sum(vaddq_u16(sum_0, sum_1));
- } else if (ref_1_size_log2 == 6) { // 64x64
- const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
- return Sum(vaddq_u16(sum_0, sum_1));
+ switch (ref_1_size_log2) {
+ case 4: { // 64x16
+ const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+ const uint16x8_t sum_1 = vpaddlq_u8(val_1);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 5: { // 64x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 6: { // 64x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
}
}
// 64x1
@@ -318,9 +348,10 @@ inline void DcStore_NEON(void* const dest, ptrdiff_t stride,
}
template <int width, int height>
-inline void Paeth4Or8xN_NEON(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+inline void Paeth4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
auto* dest_u8 = static_cast<uint8_t*>(dest);
const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
@@ -425,9 +456,10 @@ inline uint8x16_t SelectPaeth(const uint8x16_t top, const uint8x16_t left,
top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
template <int width, int height>
-inline void Paeth16PlusxN_NEON(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+inline void Paeth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
auto* dest_u8 = static_cast<uint8_t*>(dest);
const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
@@ -769,87 +801,111 @@ inline uint16x8_t LoadAndAdd64(const uint16_t* buf) {
// |ref_[01]| each point to 1 << |ref[01]_size_log2| packed uint16_t values.
// If |use_ref_1| is false then only sum |ref_0|.
-inline uint32x2_t DcSum_NEON(const void* ref_0, const int ref_0_size_log2,
- const bool use_ref_1, const void* ref_1,
+inline uint32x2_t DcSum_NEON(const void* LIBGAV1_RESTRICT ref_0,
+ const int ref_0_size_log2, const bool use_ref_1,
+ const void* LIBGAV1_RESTRICT ref_1,
const int ref_1_size_log2) {
const auto* ref_0_u16 = static_cast<const uint16_t*>(ref_0);
const auto* ref_1_u16 = static_cast<const uint16_t*>(ref_1);
if (ref_0_size_log2 == 2) {
const uint16x4_t val_0 = vld1_u16(ref_0_u16);
if (use_ref_1) {
- if (ref_1_size_log2 == 2) { // 4x4
- const uint16x4_t val_1 = vld1_u16(ref_1_u16);
- return Sum(vadd_u16(val_0, val_1));
- } else if (ref_1_size_log2 == 3) { // 4x8
- const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
- const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
- return Sum(vaddq_u16(sum_0, val_1));
- } else if (ref_1_size_log2 == 4) { // 4x16
- const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
- const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
- return Sum(vaddq_u16(sum_0, sum_1));
+ switch (ref_1_size_log2) {
+ case 2: { // 4x4
+ const uint16x4_t val_1 = vld1_u16(ref_1_u16);
+ return Sum(vadd_u16(val_0, val_1));
+ }
+ case 3: { // 4x8
+ const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+ const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
+ return Sum(vaddq_u16(sum_0, val_1));
+ }
+ case 4: { // 4x16
+ const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
}
}
// 4x1
return Sum(val_0);
- } else if (ref_0_size_log2 == 3) {
+ }
+ if (ref_0_size_log2 == 3) {
const uint16x8_t val_0 = vld1q_u16(ref_0_u16);
if (use_ref_1) {
- if (ref_1_size_log2 == 2) { // 8x4
- const uint16x4_t val_1 = vld1_u16(ref_1_u16);
- const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
- return Sum(vaddq_u16(val_0, sum_1));
- } else if (ref_1_size_log2 == 3) { // 8x8
- const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
- return Sum(vaddq_u16(val_0, val_1));
- } else if (ref_1_size_log2 == 4) { // 8x16
- const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
- return Sum(vaddq_u16(val_0, sum_1));
- } else if (ref_1_size_log2 == 5) { // 8x32
- const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
- return Sum(vaddq_u16(val_0, sum_1));
+ switch (ref_1_size_log2) {
+ case 2: { // 8x4
+ const uint16x4_t val_1 = vld1_u16(ref_1_u16);
+ const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
+ return Sum(vaddq_u16(val_0, sum_1));
+ }
+ case 3: { // 8x8
+ const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+ return Sum(vaddq_u16(val_0, val_1));
+ }
+ case 4: { // 8x16
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(val_0, sum_1));
+ }
+ case 5: { // 8x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+ return Sum(vaddq_u16(val_0, sum_1));
+ }
}
}
// 8x1
return Sum(val_0);
- } else if (ref_0_size_log2 == 4) {
+ }
+ if (ref_0_size_log2 == 4) {
const uint16x8_t sum_0 = LoadAndAdd16(ref_0_u16);
if (use_ref_1) {
- if (ref_1_size_log2 == 2) { // 16x4
- const uint16x4_t val_1 = vld1_u16(ref_1_u16);
- const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
- return Sum(vaddq_u16(sum_0, sum_1));
- } else if (ref_1_size_log2 == 3) { // 16x8
- const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
- return Sum(vaddq_u16(sum_0, val_1));
- } else if (ref_1_size_log2 == 4) { // 16x16
- const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
- return Sum(vaddq_u16(sum_0, sum_1));
- } else if (ref_1_size_log2 == 5) { // 16x32
- const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
- return Sum(vaddq_u16(sum_0, sum_1));
- } else if (ref_1_size_log2 == 6) { // 16x64
- const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
- return Sum(vaddq_u16(sum_0, sum_1));
+ switch (ref_1_size_log2) {
+ case 2: { // 16x4
+ const uint16x4_t val_1 = vld1_u16(ref_1_u16);
+ const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 3: { // 16x8
+ const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, val_1));
+ }
+ case 4: { // 16x16
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 5: { // 16x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 6: { // 16x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
}
}
// 16x1
return Sum(sum_0);
- } else if (ref_0_size_log2 == 5) {
+ }
+ if (ref_0_size_log2 == 5) {
const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u16);
if (use_ref_1) {
- if (ref_1_size_log2 == 3) { // 32x8
- const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
- return Sum(vaddq_u16(sum_0, val_1));
- } else if (ref_1_size_log2 == 4) { // 32x16
- const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
- return Sum(vaddq_u16(sum_0, sum_1));
- } else if (ref_1_size_log2 == 5) { // 32x32
- const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
- return Sum(vaddq_u16(sum_0, sum_1));
- } else if (ref_1_size_log2 == 6) { // 32x64
- const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
- return Sum(vaddq_u16(sum_0, sum_1));
+ switch (ref_1_size_log2) {
+ case 3: { // 32x8
+ const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, val_1));
+ }
+ case 4: { // 32x16
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 5: { // 32x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 6: { // 32x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
}
}
// 32x1
@@ -859,15 +915,19 @@ inline uint32x2_t DcSum_NEON(const void* ref_0, const int ref_0_size_log2,
assert(ref_0_size_log2 == 6);
const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u16);
if (use_ref_1) {
- if (ref_1_size_log2 == 4) { // 64x16
- const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
- return Sum(vaddq_u16(sum_0, sum_1));
- } else if (ref_1_size_log2 == 5) { // 64x32
- const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
- return Sum(vaddq_u16(sum_0, sum_1));
- } else if (ref_1_size_log2 == 6) { // 64x64
- const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
- return Sum(vaddq_u16(sum_0, sum_1));
+ switch (ref_1_size_log2) {
+ case 4: { // 64x16
+ const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 5: { // 64x32
+ const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
+ case 6: { // 64x64
+ const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
+ return Sum(vaddq_u16(sum_0, sum_1));
+ }
}
}
// 64x1
@@ -968,9 +1028,9 @@ struct DcDefs {
// IntraPredFuncs_NEON::Horizontal -- duplicate left column across all rows
template <int block_height>
-void Horizontal4xH_NEON(void* const dest, ptrdiff_t stride,
+void Horizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const void* /*top_row*/,
- const void* const left_column) {
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left = static_cast<const uint16_t*>(left_column);
auto* dst = static_cast<uint8_t*>(dest);
int y = 0;
@@ -983,9 +1043,9 @@ void Horizontal4xH_NEON(void* const dest, ptrdiff_t stride,
}
template <int block_height>
-void Horizontal8xH_NEON(void* const dest, ptrdiff_t stride,
+void Horizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const void* /*top_row*/,
- const void* const left_column) {
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left = static_cast<const uint16_t*>(left_column);
auto* dst = static_cast<uint8_t*>(dest);
int y = 0;
@@ -998,9 +1058,9 @@ void Horizontal8xH_NEON(void* const dest, ptrdiff_t stride,
}
template <int block_height>
-void Horizontal16xH_NEON(void* const dest, ptrdiff_t stride,
+void Horizontal16xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const void* /*top_row*/,
- const void* const left_column) {
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left = static_cast<const uint16_t*>(left_column);
auto* dst = static_cast<uint8_t*>(dest);
int y = 0;
@@ -1020,9 +1080,9 @@ void Horizontal16xH_NEON(void* const dest, ptrdiff_t stride,
}
template <int block_height>
-void Horizontal32xH_NEON(void* const dest, ptrdiff_t stride,
+void Horizontal32xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const void* /*top_row*/,
- const void* const left_column) {
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left = static_cast<const uint16_t*>(left_column);
auto* dst = static_cast<uint8_t*>(dest);
int y = 0;
@@ -1048,8 +1108,8 @@ void Horizontal32xH_NEON(void* const dest, ptrdiff_t stride,
// IntraPredFuncs_NEON::Vertical -- copy top row to all rows
template <int block_height>
-void Vertical4xH_NEON(void* const dest, ptrdiff_t stride,
- const void* const top_row,
+void Vertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
const void* const /*left_column*/) {
const auto* const top = static_cast<const uint8_t*>(top_row);
auto* dst = static_cast<uint8_t*>(dest);
@@ -1062,8 +1122,8 @@ void Vertical4xH_NEON(void* const dest, ptrdiff_t stride,
}
template <int block_height>
-void Vertical8xH_NEON(void* const dest, ptrdiff_t stride,
- const void* const top_row,
+void Vertical8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
const void* const /*left_column*/) {
const auto* const top = static_cast<const uint8_t*>(top_row);
auto* dst = static_cast<uint8_t*>(dest);
@@ -1076,8 +1136,8 @@ void Vertical8xH_NEON(void* const dest, ptrdiff_t stride,
}
template <int block_height>
-void Vertical16xH_NEON(void* const dest, ptrdiff_t stride,
- const void* const top_row,
+void Vertical16xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
const void* const /*left_column*/) {
const auto* const top = static_cast<const uint8_t*>(top_row);
auto* dst = static_cast<uint8_t*>(dest);
@@ -1096,8 +1156,8 @@ void Vertical16xH_NEON(void* const dest, ptrdiff_t stride,
}
template <int block_height>
-void Vertical32xH_NEON(void* const dest, ptrdiff_t stride,
- const void* const top_row,
+void Vertical32xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
const void* const /*left_column*/) {
const auto* const top = static_cast<const uint8_t*>(top_row);
auto* dst = static_cast<uint8_t*>(dest);
@@ -1122,8 +1182,8 @@ void Vertical32xH_NEON(void* const dest, ptrdiff_t stride,
}
template <int block_height>
-void Vertical64xH_NEON(void* const dest, ptrdiff_t stride,
- const void* const top_row,
+void Vertical64xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
const void* const /*left_column*/) {
const auto* const top = static_cast<const uint8_t*>(top_row);
auto* dst = static_cast<uint8_t*>(dest);
@@ -1159,6 +1219,145 @@ void Vertical64xH_NEON(void* const dest, ptrdiff_t stride,
} while (y != 0);
}
+template <int height>
+inline void Paeth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_ptr,
+ const void* LIBGAV1_RESTRICT const left_ptr) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
+ const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
+
+ const uint16x4_t top_left = vdup_n_u16(top_row[-1]);
+ const uint16x4_t top_left_x2 = vshl_n_u16(top_left, 1);
+ const uint16x4_t top = vld1_u16(top_row);
+
+ for (int y = 0; y < height; ++y) {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16x4_t left = vdup_n_u16(left_col[y]);
+
+ const uint16x4_t left_dist = vabd_u16(top, top_left);
+ const uint16x4_t top_dist = vabd_u16(left, top_left);
+ const uint16x4_t top_left_dist = vabd_u16(vadd_u16(top, left), top_left_x2);
+
+ const uint16x4_t left_le_top = vcle_u16(left_dist, top_dist);
+ const uint16x4_t left_le_top_left = vcle_u16(left_dist, top_left_dist);
+ const uint16x4_t top_le_top_left = vcle_u16(top_dist, top_left_dist);
+
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint16x4_t left_mask = vand_u16(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint16x4_t result = vbsl_u16(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint16x4_t left_or_top_mask = vorr_u16(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ result = vbsl_u16(left_or_top_mask, result, top_left);
+
+ vst1_u16(dst16, result);
+ dst += stride;
+ }
+}
+
+template <int height>
+inline void Paeth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_ptr,
+ const void* LIBGAV1_RESTRICT const left_ptr) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
+ const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
+
+ const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
+ const uint16x8_t top_left_x2 = vshlq_n_u16(top_left, 1);
+ const uint16x8_t top = vld1q_u16(top_row);
+
+ for (int y = 0; y < height; ++y) {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16x8_t left = vdupq_n_u16(left_col[y]);
+
+ const uint16x8_t left_dist = vabdq_u16(top, top_left);
+ const uint16x8_t top_dist = vabdq_u16(left, top_left);
+ const uint16x8_t top_left_dist =
+ vabdq_u16(vaddq_u16(top, left), top_left_x2);
+
+ const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
+ const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
+ const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
+
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint16x8_t result = vbslq_u16(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ result = vbslq_u16(left_or_top_mask, result, top_left);
+
+ vst1q_u16(dst16, result);
+ dst += stride;
+ }
+}
+
+// For 16xH and above.
+template <int width, int height>
+inline void PaethWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_ptr,
+ const void* LIBGAV1_RESTRICT const left_ptr) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
+ const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
+
+ const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
+ const uint16x8_t top_left_x2 = vshlq_n_u16(top_left, 1);
+
+ uint16x8_t top[width >> 3];
+ for (int i = 0; i < width >> 3; ++i) {
+ top[i] = vld1q_u16(top_row + (i << 3));
+ }
+
+ for (int y = 0; y < height; ++y) {
+ auto* dst_x = reinterpret_cast<uint16_t*>(dst);
+ const uint16x8_t left = vdupq_n_u16(left_col[y]);
+ const uint16x8_t top_dist = vabdq_u16(left, top_left);
+
+ for (int i = 0; i < (width >> 3); ++i) {
+ const uint16x8_t left_dist = vabdq_u16(top[i], top_left);
+ const uint16x8_t top_left_dist =
+ vabdq_u16(vaddq_u16(top[i], left), top_left_x2);
+
+ const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
+ const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
+ const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
+
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint16x8_t result = vbslq_u16(left_mask, left, top[i]);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ result = vbslq_u16(left_or_top_mask, result, top_left);
+
+ vst1q_u16(dst_x, result);
+ dst_x += 8;
+ }
+ dst += stride;
+ }
+}
+
void Init10bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
assert(dsp != nullptr);
@@ -1170,6 +1369,8 @@ void Init10bpp() {
DcDefs::_4x4::Dc;
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
Vertical4xH_NEON<4>;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+ Paeth4xH_NEON<4>;
// 4x8
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
@@ -1182,6 +1383,8 @@ void Init10bpp() {
Horizontal4xH_NEON<8>;
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
Vertical4xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+ Paeth4xH_NEON<8>;
// 4x16
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
@@ -1194,6 +1397,8 @@ void Init10bpp() {
Horizontal4xH_NEON<16>;
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
Vertical4xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+ Paeth4xH_NEON<16>;
// 8x4
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
@@ -1204,6 +1409,8 @@ void Init10bpp() {
DcDefs::_8x4::Dc;
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
Vertical8xH_NEON<4>;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+ Paeth8xH_NEON<4>;
// 8x8
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
@@ -1216,6 +1423,8 @@ void Init10bpp() {
Horizontal8xH_NEON<8>;
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
Vertical8xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+ Paeth8xH_NEON<8>;
// 8x16
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
@@ -1226,6 +1435,8 @@ void Init10bpp() {
DcDefs::_8x16::Dc;
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
Vertical8xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+ Paeth8xH_NEON<16>;
// 8x32
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
@@ -1238,6 +1449,8 @@ void Init10bpp() {
Horizontal8xH_NEON<32>;
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
Vertical8xH_NEON<32>;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+ Paeth8xH_NEON<32>;
// 16x4
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
@@ -1248,6 +1461,8 @@ void Init10bpp() {
DcDefs::_16x4::Dc;
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
Vertical16xH_NEON<4>;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+ PaethWxH_NEON<16, 4>;
// 16x8
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
@@ -1260,6 +1475,8 @@ void Init10bpp() {
Horizontal16xH_NEON<8>;
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
Vertical16xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+ PaethWxH_NEON<16, 8>;
// 16x16
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
@@ -1270,6 +1487,8 @@ void Init10bpp() {
DcDefs::_16x16::Dc;
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
Vertical16xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+ PaethWxH_NEON<16, 16>;
// 16x32
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
@@ -1280,6 +1499,8 @@ void Init10bpp() {
DcDefs::_16x32::Dc;
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
Vertical16xH_NEON<32>;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+ PaethWxH_NEON<16, 32>;
// 16x64
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
@@ -1290,6 +1511,8 @@ void Init10bpp() {
DcDefs::_16x64::Dc;
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
Vertical16xH_NEON<64>;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+ PaethWxH_NEON<16, 64>;
// 32x8
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
@@ -1300,6 +1523,8 @@ void Init10bpp() {
DcDefs::_32x8::Dc;
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
Vertical32xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+ PaethWxH_NEON<32, 8>;
// 32x16
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
@@ -1310,6 +1535,8 @@ void Init10bpp() {
DcDefs::_32x16::Dc;
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
Vertical32xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+ PaethWxH_NEON<32, 16>;
// 32x32
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
@@ -1320,6 +1547,8 @@ void Init10bpp() {
DcDefs::_32x32::Dc;
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
Vertical32xH_NEON<32>;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+ PaethWxH_NEON<32, 32>;
// 32x64
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
@@ -1332,6 +1561,8 @@ void Init10bpp() {
Horizontal32xH_NEON<64>;
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
Vertical32xH_NEON<64>;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+ PaethWxH_NEON<32, 64>;
// 64x16
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
@@ -1342,6 +1573,8 @@ void Init10bpp() {
DcDefs::_64x16::Dc;
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
Vertical64xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+ PaethWxH_NEON<64, 16>;
// 64x32
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
@@ -1352,6 +1585,8 @@ void Init10bpp() {
DcDefs::_64x32::Dc;
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
Vertical64xH_NEON<32>;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+ PaethWxH_NEON<64, 32>;
// 64x64
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
@@ -1362,6 +1597,8 @@ void Init10bpp() {
DcDefs::_64x64::Dc;
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
Vertical64xH_NEON<64>;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+ PaethWxH_NEON<64, 64>;
}
} // namespace