path: root/src/dsp/arm
author     qinxialei <xialeiqin@gmail.com>  2021-04-22 11:20:15 +0800
committer  qinxialei <xialeiqin@gmail.com>  2021-04-22 11:20:15 +0800
commit     2381d803c76105f44717d75f089ec37f51e5cfe4 (patch)
tree       33f40fb4dfd1039ac262d5f1c1065d298578ddc1 /src/dsp/arm
parent     e8d277081293b6fb2a5d469616baaa7a06f52496 (diff)
download   libgav1-2381d803c76105f44717d75f089ec37f51e5cfe4.tar.gz
           libgav1-2381d803c76105f44717d75f089ec37f51e5cfe4.tar.bz2
           libgav1-2381d803c76105f44717d75f089ec37f51e5cfe4.zip
New upstream version 0.16.3
Diffstat (limited to 'src/dsp/arm')
-rw-r--r--  src/dsp/arm/average_blend_neon.cc            135
-rw-r--r--  src/dsp/arm/cdef_neon.cc                      11
-rw-r--r--  src/dsp/arm/common_neon.h                     70
-rw-r--r--  src/dsp/arm/convolve_neon.cc                  943
-rw-r--r--  src/dsp/arm/distance_weighted_blend_neon.cc   162
-rw-r--r--  src/dsp/arm/distance_weighted_blend_neon.h    2
-rw-r--r--  src/dsp/arm/film_grain_neon.cc                2
-rw-r--r--  src/dsp/arm/intra_edge_neon.cc                243
-rw-r--r--  src/dsp/arm/intra_edge_neon.h                 3
-rw-r--r--  src/dsp/arm/intrapred_cfl_neon.cc             1012
-rw-r--r--  src/dsp/arm/intrapred_cfl_neon.h              179
-rw-r--r--  src/dsp/arm/intrapred_directional_neon.cc     594
-rw-r--r--  src/dsp/arm/intrapred_directional_neon.h      56
-rw-r--r--  src/dsp/arm/intrapred_filter_neon.cc (renamed from src/dsp/arm/intrapred_filter_intra_neon.cc)  10
-rw-r--r--  src/dsp/arm/intrapred_filter_neon.h           37
-rw-r--r--  src/dsp/arm/intrapred_neon.cc                 247
-rw-r--r--  src/dsp/arm/intrapred_neon.h                  218
-rw-r--r--  src/dsp/arm/intrapred_smooth_neon.cc          5
-rw-r--r--  src/dsp/arm/intrapred_smooth_neon.h           149
-rw-r--r--  src/dsp/arm/inverse_transform_10bit_neon.cc   2543
-rw-r--r--  src/dsp/arm/inverse_transform_neon.cc         2
-rw-r--r--  src/dsp/arm/inverse_transform_neon.h          16
-rw-r--r--  src/dsp/arm/loop_filter_neon.cc               18
-rw-r--r--  src/dsp/arm/loop_restoration_neon.cc          1470
-rw-r--r--  src/dsp/arm/mask_blend_neon.cc                2
-rw-r--r--  src/dsp/arm/motion_field_projection_neon.cc   2
-rw-r--r--  src/dsp/arm/motion_vector_search_neon.cc      2
-rw-r--r--  src/dsp/arm/obmc_neon.cc                      2
-rw-r--r--  src/dsp/arm/super_res_neon.cc                 151
-rw-r--r--  src/dsp/arm/super_res_neon.h                  5
-rw-r--r--  src/dsp/arm/warp_neon.cc                      4
-rw-r--r--  src/dsp/arm/weight_mask_neon.cc               2
32 files changed, 7046 insertions, 1251 deletions
diff --git a/src/dsp/arm/average_blend_neon.cc b/src/dsp/arm/average_blend_neon.cc
index 834e8b4..5b4c094 100644
--- a/src/dsp/arm/average_blend_neon.cc
+++ b/src/dsp/arm/average_blend_neon.cc
@@ -35,6 +35,11 @@ namespace {
constexpr int kInterPostRoundBit =
kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
inline uint8x8_t AverageBlend8Row(const int16_t* prediction_0,
const int16_t* prediction_1) {
const int16x8_t pred0 = vld1q_s16(prediction_0);
@@ -128,13 +133,139 @@ void Init8bpp() {
}
} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+inline uint16x8_t AverageBlend8Row(const uint16_t* prediction_0,
+ const uint16_t* prediction_1,
+ const int32x4_t compound_offset,
+ const uint16x8_t v_bitdepth) {
+ const uint16x8_t pred0 = vld1q_u16(prediction_0);
+ const uint16x8_t pred1 = vld1q_u16(prediction_1);
+ const uint32x4_t pred_lo =
+ vaddl_u16(vget_low_u16(pred0), vget_low_u16(pred1));
+ const uint32x4_t pred_hi =
+ vaddl_u16(vget_high_u16(pred0), vget_high_u16(pred1));
+ const int32x4_t offset_lo =
+ vsubq_s32(vreinterpretq_s32_u32(pred_lo), compound_offset);
+ const int32x4_t offset_hi =
+ vsubq_s32(vreinterpretq_s32_u32(pred_hi), compound_offset);
+ const uint16x4_t res_lo = vqrshrun_n_s32(offset_lo, kInterPostRoundBit + 1);
+ const uint16x4_t res_hi = vqrshrun_n_s32(offset_hi, kInterPostRoundBit + 1);
+ return vminq_u16(vcombine_u16(res_lo, res_hi), v_bitdepth);
+}
+
+inline void AverageBlendLargeRow(const uint16_t* prediction_0,
+ const uint16_t* prediction_1, const int width,
+ uint16_t* dest,
+ const int32x4_t compound_offset,
+ const uint16x8_t v_bitdepth) {
+ int x = width;
+ do {
+ vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1,
+ compound_offset, v_bitdepth));
+ prediction_0 += 8;
+ prediction_1 += 8;
+ dest += 8;
+
+ vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1,
+ compound_offset, v_bitdepth));
+ prediction_0 += 8;
+ prediction_1 += 8;
+ dest += 8;
+
+ x -= 16;
+ } while (x != 0);
+}
+
+void AverageBlend_NEON(const void* prediction_0, const void* prediction_1,
+ const int width, const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y = height;
+
+ const ptrdiff_t dst_stride = dest_stride >> 1;
+ const int32x4_t compound_offset =
+ vdupq_n_s32(static_cast<int32_t>(kCompoundOffset + kCompoundOffset));
+ const uint16x8_t v_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+ if (width == 4) {
+ do {
+ const uint16x8_t result =
+ AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth);
+ pred_0 += 8;
+ pred_1 += 8;
+
+ vst1_u16(dst, vget_low_u16(result));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(result));
+ dst += dst_stride;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ if (width == 8) {
+ do {
+ vst1q_u16(dst,
+ AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth));
+ dst += dst_stride;
+ pred_0 += 8;
+ pred_1 += 8;
+
+ vst1q_u16(dst,
+ AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth));
+ dst += dst_stride;
+ pred_0 += 8;
+ pred_1 += 8;
+
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ do {
+ AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset,
+ v_bitdepth);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset,
+ v_bitdepth);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ y -= 2;
+ } while (y != 0);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->average_blend = AverageBlend_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
-void AverageBlendInit_NEON() { Init8bpp(); }
+void AverageBlendInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
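
For reference, the new high_bitdepth AverageBlend_NEON path above averages two compound predictions per pixel by summing them, subtracting kCompoundOffset twice, applying a rounding shift of kInterPostRoundBit + 1, and clamping to the 10-bit maximum. A minimal scalar sketch of that arithmetic; the constants are taken as parameters since their values are not part of this patch:

#include <algorithm>
#include <cstdint>

// Scalar model of AverageBlend8Row()'s per-pixel math (illustrative sketch).
uint16_t AverageBlendPixel10bpp(uint16_t pred0, uint16_t pred1,
                                int compound_offset,        // kCompoundOffset
                                int inter_post_round_bit) {  // kInterPostRoundBit
  // vaddl_u16 + vsubq_s32: widen, sum, and remove both compound offsets.
  const int sum = static_cast<int>(pred0) + pred1 - 2 * compound_offset;
  // vqrshrun_n_s32(..., kInterPostRoundBit + 1): the extra bit of shift is
  // what divides the sum of the two predictions by two.
  const int shift = inter_post_round_bit + 1;
  const int rounded = (sum + (1 << (shift - 1))) >> shift;
  // vqrshrun saturates negative results to 0; vminq_u16 caps at 2^10 - 1.
  return static_cast<uint16_t>(std::min(std::max(rounded, 0), (1 << 10) - 1));
}
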
diff --git a/src/dsp/arm/cdef_neon.cc b/src/dsp/arm/cdef_neon.cc
index 4d0e76f..60c72d6 100644
--- a/src/dsp/arm/cdef_neon.cc
+++ b/src/dsp/arm/cdef_neon.cc
@@ -265,7 +265,7 @@ LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source,
// 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00
// 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00
// 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00
- partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), partial_lo[2], 0);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), vdupq_n_u16(0), 0);
partial_lo[2] = vsetq_lane_u16(SumVector(v_src[1]), partial_lo[2], 1);
partial_lo[2] = vsetq_lane_u16(SumVector(v_src[2]), partial_lo[2], 2);
partial_lo[2] = vsetq_lane_u16(SumVector(v_src[3]), partial_lo[2], 3);
@@ -285,9 +285,8 @@ LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source,
// 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00
// 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00
// 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00
- const uint8x8_t v_zero = vdup_n_u8(0);
- partial_lo[6] = vaddl_u8(v_zero, v_src[0]);
- for (int i = 1; i < 8; ++i) {
+ partial_lo[6] = vaddl_u8(v_src[0], v_src[1]);
+ for (int i = 2; i < 8; ++i) {
partial_lo[6] = vaddw_u8(partial_lo[6], v_src[i]);
}
@@ -451,7 +450,7 @@ void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference,
const uint16x8_t threshold, const int16x8_t damping) {
- // If reference > pixel, the difference will be negative, so covert to 0 or
+ // If reference > pixel, the difference will be negative, so convert to 0 or
// -1.
const uint16x8_t sign = vcgtq_u16(reference, pixel);
const uint16x8_t abs_diff = vabdq_u16(pixel, reference);
@@ -686,7 +685,7 @@ void CdefInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
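
The Constrain() comment above ("convert to 0 or -1") relies on the usual mask-based sign trick: a greater-than comparison yields an all-ones (-1) or all-zeros (0) lane, and (x ^ mask) - mask negates x exactly when the mask is -1. A scalar illustration of that trick only, not of the patch's full Constrain() implementation:

#include <cstdint>

// Apply a sign chosen by a 0 / -1 mask, as vcgtq_u16 produces per lane.
int16_t ApplySignMask(uint16_t magnitude, bool reference_greater_than_pixel) {
  const int16_t mask = reference_greater_than_pixel ? -1 : 0;  // all ones or 0
  // mask == 0:  (m ^ 0) - 0    == m
  // mask == -1: (m ^ -1) - -1  == ~m + 1 == -m  (two's complement negation)
  return static_cast<int16_t>((static_cast<int16_t>(magnitude) ^ mask) - mask);
}
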
diff --git a/src/dsp/arm/common_neon.h b/src/dsp/arm/common_neon.h
index dcb7567..05e0d05 100644
--- a/src/dsp/arm/common_neon.h
+++ b/src/dsp/arm/common_neon.h
@@ -28,8 +28,7 @@
#if 0
#include <cstdio>
-
-#include "absl/strings/str_cat.h"
+#include <string>
constexpr bool kEnablePrintRegs = true;
@@ -86,11 +85,11 @@ inline void PrintVectQ(const DebugRegisterQ r, const char* const name,
inline void PrintReg(const int32x4x2_t val, const std::string& name) {
DebugRegisterQ r;
- vst1q_u32(r.u32, val.val[0]);
- const std::string name0 = absl::StrCat(name, ".val[0]").c_str();
+ vst1q_s32(r.i32, val.val[0]);
+ const std::string name0 = name + std::string(".val[0]");
PrintVectQ(r, name0.c_str(), 32);
- vst1q_u32(r.u32, val.val[1]);
- const std::string name1 = absl::StrCat(name, ".val[1]").c_str();
+ vst1q_s32(r.i32, val.val[1]);
+ const std::string name1 = name + std::string(".val[1]");
PrintVectQ(r, name1.c_str(), 32);
}
@@ -169,14 +168,14 @@ inline void PrintReg(const int8x8_t val, const char* name) {
// Print an individual (non-vector) value in decimal format.
inline void PrintReg(const int x, const char* name) {
if (kEnablePrintRegs) {
- printf("%s: %d\n", name, x);
+ fprintf(stderr, "%s: %d\n", name, x);
}
}
// Print an individual (non-vector) value in hexadecimal format.
inline void PrintHex(const int x, const char* name) {
if (kEnablePrintRegs) {
- printf("%s: %x\n", name, x);
+ fprintf(stderr, "%s: %x\n", name, x);
}
}
@@ -277,22 +276,32 @@ inline void Store2(uint16_t* const buf, const uint16x4_t val) {
ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane));
}
+// Simplify code when caller has |buf| cast as uint8_t*.
+inline void Store4(void* const buf, const uint16x4_t val) {
+ vst1_u16(static_cast<uint16_t*>(buf), val);
+}
+
+// Simplify code when caller has |buf| cast as uint8_t*.
+inline void Store8(void* const buf, const uint16x8_t val) {
+ vst1q_u16(static_cast<uint16_t*>(buf), val);
+}
+
//------------------------------------------------------------------------------
// Bit manipulation.
// vshXX_n_XX() requires an immediate.
template <int shift>
-inline uint8x8_t LeftShift(const uint8x8_t vector) {
+inline uint8x8_t LeftShiftVector(const uint8x8_t vector) {
return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vector), shift));
}
template <int shift>
-inline uint8x8_t RightShift(const uint8x8_t vector) {
+inline uint8x8_t RightShiftVector(const uint8x8_t vector) {
return vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(vector), shift));
}
template <int shift>
-inline int8x8_t RightShift(const int8x8_t vector) {
+inline int8x8_t RightShiftVector(const int8x8_t vector) {
return vreinterpret_s8_u64(vshr_n_u64(vreinterpret_u64_s8(vector), shift));
}
@@ -387,6 +396,15 @@ inline uint16_t SumVector(const uint8x8_t a) {
#endif // defined(__aarch64__)
}
+inline uint32_t SumVector(const uint32x2_t a) {
+#if defined(__aarch64__)
+ return vaddv_u32(a);
+#else
+ const uint64x1_t b = vpaddl_u32(a);
+ return vget_lane_u32(vreinterpret_u32_u64(b), 0);
+#endif // defined(__aarch64__)
+}
+
inline uint32_t SumVector(const uint32x4_t a) {
#if defined(__aarch64__)
return vaddvq_u32(a);
@@ -447,6 +465,36 @@ inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) {
}
// Input:
+// 00 01 02 03
+// 10 11 12 13
+// 20 21 22 23
+// 30 31 32 33
+inline void Transpose4x4(uint16x4_t a[4]) {
+ // b:
+ // 00 10 02 12
+ // 01 11 03 13
+ const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
+ // c:
+ // 20 30 22 32
+ // 21 31 23 33
+ const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
+ // d:
+ // 00 10 20 30
+ // 02 12 22 32
+ const uint32x2x2_t d =
+ vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
+ // e:
+ // 01 11 21 31
+ // 03 13 23 33
+ const uint32x2x2_t e =
+ vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
+ a[0] = vreinterpret_u16_u32(d.val[0]);
+ a[1] = vreinterpret_u16_u32(e.val[0]);
+ a[2] = vreinterpret_u16_u32(d.val[1]);
+ a[3] = vreinterpret_u16_u32(e.val[1]);
+}
+
+// Input:
// a: 00 01 02 03 10 11 12 13
// b: 20 21 22 23 30 31 32 33
// Output:
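
Below, a usage sketch for the 16-bit Transpose4x4() added above, assuming an ARM target with <arm_neon.h> and the Transpose4x4 from the patched common_neon.h visible in the calling scope. Rows go in and columns come out, as the comment lattice documents:

#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

void Transpose4x4Demo() {
  const uint16_t rows[4][4] = {{0x00, 0x01, 0x02, 0x03},
                               {0x10, 0x11, 0x12, 0x13},
                               {0x20, 0x21, 0x22, 0x23},
                               {0x30, 0x31, 0x32, 0x33}};
  uint16x4_t a[4];
  for (int i = 0; i < 4; ++i) a[i] = vld1_u16(rows[i]);
  Transpose4x4(a);  // The function added in this patch.
  uint16_t out[4];
  vst1_u16(out, a[0]);
  // Prints 0 0x10 0x20 0x30: the first output row is the first input column.
  printf("%#x %#x %#x %#x\n", out[0], out[1], out[2], out[3]);
}
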
diff --git a/src/dsp/arm/convolve_neon.cc b/src/dsp/arm/convolve_neon.cc
index fd9b912..331bfe2 100644
--- a/src/dsp/arm/convolve_neon.cc
+++ b/src/dsp/arm/convolve_neon.cc
@@ -101,245 +101,278 @@ int16x8_t SumOnePassTaps(const uint8x8_t* const src,
return vreinterpretq_s16_u16(sum);
}
-template <int filter_index, bool negative_outside_taps>
-int16x8_t SumHorizontalTaps(const uint8_t* const src,
- const uint8x8_t* const v_tap) {
- uint8x8_t v_src[8];
- const uint8x16_t src_long = vld1q_u8(src);
- int16x8_t sum;
-
- if (filter_index < 2) {
- v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 1));
- v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 2));
- v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 3));
- v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 4));
- v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 5));
- v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 6));
- sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 1);
- } else if (filter_index == 2) {
- v_src[0] = vget_low_u8(src_long);
- v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
- v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
- v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
- v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
- v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
- v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
- v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
- sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap);
- } else if (filter_index == 3) {
- v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 3));
- v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 4));
- sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 3);
- } else if (filter_index > 3) {
- v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 2));
- v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 3));
- v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 4));
- v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 5));
- sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 2);
- }
- return sum;
-}
-
-template <int filter_index, bool negative_outside_taps>
-uint8x8_t SimpleHorizontalTaps(const uint8_t* const src,
- const uint8x8_t* const v_tap) {
- int16x8_t sum =
- SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
-
- // Normally the Horizontal pass does the downshift in two passes:
- // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
- // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
- // requires adding the rounding offset from the skipped shift.
- constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
-
- sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
- return vqrshrun_n_s16(sum, kFilterBits - 1);
-}
-
-template <int filter_index, bool negative_outside_taps>
-uint16x8_t HorizontalTaps8To16(const uint8_t* const src,
- const uint8x8_t* const v_tap) {
- const int16x8_t sum =
- SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
-
- return vreinterpretq_u16_s16(
- vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
-}
-
-template <int filter_index>
-int16x8_t SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
- const uint8x8_t* const v_tap) {
- uint16x8_t sum;
- const uint8x8_t input0 = vld1_u8(src);
- src += src_stride;
- const uint8x8_t input1 = vld1_u8(src);
- uint8x8x2_t input = vzip_u8(input0, input1);
-
- if (filter_index == 3) {
- // tap signs : + +
- sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
- sum = vmlal_u8(sum, input.val[1], v_tap[4]);
- } else if (filter_index == 4) {
- // tap signs : - + + -
- sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
- sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
- sum = vmlal_u8(sum, input.val[1], v_tap[4]);
- sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
- } else {
- // tap signs : + + + +
- sum = vmull_u8(RightShift<4 * 8>(input.val[0]), v_tap[2]);
- sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
- sum = vmlal_u8(sum, input.val[1], v_tap[4]);
- sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
- }
-
- return vreinterpretq_s16_u16(sum);
-}
-
-template <int filter_index>
-uint8x8_t SimpleHorizontalTaps2x2(const uint8_t* src,
- const ptrdiff_t src_stride,
- const uint8x8_t* const v_tap) {
- int16x8_t sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
- // Normally the Horizontal pass does the downshift in two passes:
- // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
- // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
- // requires adding the rounding offset from the skipped shift.
- constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
-
- sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
- return vqrshrun_n_s16(sum, kFilterBits - 1);
-}
-
-template <int filter_index>
-uint16x8_t HorizontalTaps8To16_2x2(const uint8_t* src,
- const ptrdiff_t src_stride,
- const uint8x8_t* const v_tap) {
- const int16x8_t sum =
- SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
- return vreinterpretq_u16_s16(
- vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
-}
-
-template <int num_taps, int step, int filter_index,
- bool negative_outside_taps = true, bool is_2d = false,
- bool is_compound = false>
-void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dest, const ptrdiff_t pred_stride,
- const int width, const int height,
- const uint8x8_t* const v_tap) {
+template <int filter_index, bool negative_outside_taps, bool is_2d,
+ bool is_compound>
+void FilterHorizontalWidth8AndUp(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int width, const int height,
+ const uint8x8_t* const v_tap) {
auto* dest8 = static_cast<uint8_t*>(dest);
auto* dest16 = static_cast<uint16_t*>(dest);
-
- // 4 tap filters are never used when width > 4.
- if (num_taps != 4 && width > 4) {
- int y = 0;
+ if (!is_2d) {
+ int y = height;
do {
int x = 0;
- do {
- if (is_2d || is_compound) {
- const uint16x8_t v_sum =
- HorizontalTaps8To16<filter_index, negative_outside_taps>(&src[x],
- v_tap);
+ do { // Increasing loop counter x is better.
+ const uint8x16_t src_long = vld1q_u8(src + x);
+ uint8x8_t v_src[8];
+ int16x8_t sum;
+ if (filter_index < 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+ v_tap + 1);
+ } else if (filter_index == 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+ v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
+ } else if (filter_index == 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+ } else if (filter_index > 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+ }
+ if (is_compound) {
+ const uint16x8_t v_sum = vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
vst1q_u16(&dest16[x], v_sum);
} else {
- const uint8x8_t result =
- SimpleHorizontalTaps<filter_index, negative_outside_taps>(&src[x],
- v_tap);
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+ // Combining them requires adding the rounding offset from the skipped
+ // shift.
+ constexpr int first_shift_rounding_bit =
+ 1 << (kInterRoundBitsHorizontal - 2);
+ sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+ const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
vst1_u8(&dest8[x], result);
}
- x += step;
+ x += 8;
} while (x < width);
src += src_stride;
dest8 += pred_stride;
dest16 += pred_stride;
- } while (++y < height);
+ } while (--y != 0);
+ } else {
+ int x = 0;
+ do {
+ const uint8_t* s = src + x;
+ int y = height;
+ do { // Increasing loop counter x is better.
+ const uint8x16_t src_long = vld1q_u8(s);
+ uint8x8_t v_src[8];
+ int16x8_t sum;
+ if (filter_index < 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+ v_tap + 1);
+ } else if (filter_index == 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+ v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
+ } else if (filter_index == 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+ } else if (filter_index > 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+ }
+ const uint16x8_t v_sum = vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+ vst1q_u16(dest16, v_sum);
+ s += src_stride;
+ dest16 += 8;
+ } while (--y != 0);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+template <int filter_index, bool is_2d, bool is_compound>
+void FilterHorizontalWidth4(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int height, const uint8x8_t* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+ int y = height;
+ do {
+ uint8x8_t v_src[4];
+ int16x8_t sum;
+ v_src[0] = vld1_u8(src);
+ if (filter_index == 3) {
+ v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+ } else {
+ v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
+ v_src[2] = RightShiftVector<2 * 8>(v_src[0]);
+ v_src[3] = RightShiftVector<3 * 8>(v_src[0]);
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+ }
+ if (is_2d || is_compound) {
+ const uint16x4_t v_sum = vreinterpret_u16_s16(
+ vrshr_n_s16(vget_low_s16(sum), kInterRoundBitsHorizontal - 1));
+ vst1_u16(dest16, v_sum);
+ } else {
+ constexpr int first_shift_rounding_bit =
+ 1 << (kInterRoundBitsHorizontal - 2);
+ sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+ const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
+ StoreLo4(&dest8[0], result);
+ }
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (--y != 0);
+}
+
+template <int filter_index, bool is_2d>
+void FilterHorizontalWidth2(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int height, const uint8x8_t* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+ int y = height >> 1;
+ do {
+ const uint8x8_t input0 = vld1_u8(src);
+ const uint8x8_t input1 = vld1_u8(src + src_stride);
+ const uint8x8x2_t input = vzip_u8(input0, input1);
+ uint16x8_t sum;
+ if (filter_index == 3) {
+ // tap signs : + +
+ sum = vmull_u8(input.val[0], v_tap[3]);
+ sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 2), v_tap[4]);
+ } else if (filter_index == 4) {
+ // tap signs : - + + -
+ sum = vmull_u8(RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
+ sum = vmlsl_u8(sum, input.val[0], v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
+ sum = vmlsl_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
+ } else {
+ // tap signs : + + + +
+ sum = vmull_u8(input.val[0], v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
+ sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
+ sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
+ }
+ int16x8_t s = vreinterpretq_s16_u16(sum);
+ if (is_2d) {
+ const uint16x8_t v_sum =
+ vreinterpretq_u16_s16(vrshrq_n_s16(s, kInterRoundBitsHorizontal - 1));
+ dest16[0] = vgetq_lane_u16(v_sum, 0);
+ dest16[1] = vgetq_lane_u16(v_sum, 2);
+ dest16 += pred_stride;
+ dest16[0] = vgetq_lane_u16(v_sum, 1);
+ dest16[1] = vgetq_lane_u16(v_sum, 3);
+ dest16 += pred_stride;
+ } else {
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+ // Combining them requires adding the rounding offset from the skipped
+ // shift.
+ constexpr int first_shift_rounding_bit =
+ 1 << (kInterRoundBitsHorizontal - 2);
+ s = vaddq_s16(s, vdupq_n_s16(first_shift_rounding_bit));
+ const uint8x8_t result = vqrshrun_n_s16(s, kFilterBits - 1);
+ dest8[0] = vget_lane_u8(result, 0);
+ dest8[1] = vget_lane_u8(result, 2);
+ dest8 += pred_stride;
+ dest8[0] = vget_lane_u8(result, 1);
+ dest8[1] = vget_lane_u8(result, 3);
+ dest8 += pred_stride;
+ }
+ src += src_stride << 1;
+ } while (--y != 0);
+
+ // The 2d filters have an odd |height| because the horizontal pass
+ // generates context for the vertical pass.
+ if (is_2d) {
+ assert(height % 2 == 1);
+ const uint8x8_t input = vld1_u8(src);
+ uint16x8_t sum;
+ if (filter_index == 3) {
+ sum = vmull_u8(input, v_tap[3]);
+ sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[4]);
+ } else if (filter_index == 4) {
+ sum = vmull_u8(RightShiftVector<1 * 8>(input), v_tap[3]);
+ sum = vmlsl_u8(sum, input, v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
+ sum = vmlsl_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
+ } else {
+ assert(filter_index == 5);
+ sum = vmull_u8(input, v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[3]);
+ sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
+ sum = vmlal_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
+ }
+ // |sum| contains an int16_t value.
+ sum = vreinterpretq_u16_s16(vrshrq_n_s16(vreinterpretq_s16_u16(sum),
+ kInterRoundBitsHorizontal - 1));
+ Store2<0>(dest16, sum);
+ }
+}
+
+template <int filter_index, bool negative_outside_taps, bool is_2d,
+ bool is_compound>
+void FilterHorizontal(const uint8_t* const src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int width, const int height,
+ const uint8x8_t* const v_tap) {
+ assert(width < 8 || filter_index <= 3);
+ // Don't simplify the redundant if conditions with the template parameters,
+ // which helps the compiler generate compact code.
+ if (width >= 8 && filter_index <= 3) {
+ FilterHorizontalWidth8AndUp<filter_index, negative_outside_taps, is_2d,
+ is_compound>(src, src_stride, dest, pred_stride,
+ width, height, v_tap);
return;
}
- // Horizontal passes only needs to account for |num_taps| 2 and 4 when
+ // Horizontal passes only needs to account for number of taps 2 and 4 when
// |width| <= 4.
assert(width <= 4);
- assert(num_taps <= 4);
- if (num_taps <= 4) {
+ assert(filter_index >= 3 && filter_index <= 5);
+ if (filter_index >= 3 && filter_index <= 5) {
if (width == 4) {
- int y = 0;
- do {
- if (is_2d || is_compound) {
- const uint16x8_t v_sum =
- HorizontalTaps8To16<filter_index, negative_outside_taps>(src,
- v_tap);
- vst1_u16(dest16, vget_low_u16(v_sum));
- } else {
- const uint8x8_t result =
- SimpleHorizontalTaps<filter_index, negative_outside_taps>(src,
- v_tap);
- StoreLo4(&dest8[0], result);
- }
- src += src_stride;
- dest8 += pred_stride;
- dest16 += pred_stride;
- } while (++y < height);
+ FilterHorizontalWidth4<filter_index, is_2d, is_compound>(
+ src, src_stride, dest, pred_stride, height, v_tap);
return;
}
-
+ assert(width == 2);
if (!is_compound) {
- int y = 0;
- do {
- if (is_2d) {
- const uint16x8_t sum =
- HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
- dest16[0] = vgetq_lane_u16(sum, 0);
- dest16[1] = vgetq_lane_u16(sum, 2);
- dest16 += pred_stride;
- dest16[0] = vgetq_lane_u16(sum, 1);
- dest16[1] = vgetq_lane_u16(sum, 3);
- dest16 += pred_stride;
- } else {
- const uint8x8_t sum =
- SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
- dest8[0] = vget_lane_u8(sum, 0);
- dest8[1] = vget_lane_u8(sum, 2);
- dest8 += pred_stride;
-
- dest8[0] = vget_lane_u8(sum, 1);
- dest8[1] = vget_lane_u8(sum, 3);
- dest8 += pred_stride;
- }
-
- src += src_stride << 1;
- y += 2;
- } while (y < height - 1);
-
- // The 2d filters have an odd |height| because the horizontal pass
- // generates context for the vertical pass.
- if (is_2d) {
- assert(height % 2 == 1);
- uint16x8_t sum;
- const uint8x8_t input = vld1_u8(src);
- if (filter_index == 3) { // |num_taps| == 2
- sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
- sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
- } else if (filter_index == 4) {
- sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
- sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
- sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
- sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
- } else {
- assert(filter_index == 5);
- sum = vmull_u8(RightShift<2 * 8>(input), v_tap[2]);
- sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
- sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
- sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
- }
- // |sum| contains an int16_t value.
- sum = vreinterpretq_u16_s16(vrshrq_n_s16(
- vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1));
- Store2<0>(dest16, sum);
- }
+ FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest,
+ pred_stride, height, v_tap);
}
}
}
@@ -451,78 +484,85 @@ int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
}
template <int num_taps, bool is_compound = false>
-void Filter2DVertical(const uint16_t* src, void* const dst,
- const ptrdiff_t dst_stride, const int width,
- const int height, const int16x8_t taps) {
+void Filter2DVerticalWidth8AndUp(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const int16x8_t taps) {
assert(width >= 8);
constexpr int next_row = num_taps - 1;
- // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
- const ptrdiff_t src_stride = width;
-
- auto* dst8 = static_cast<uint8_t*>(dst);
- auto* dst16 = static_cast<uint16_t*>(dst);
+ auto* const dst8 = static_cast<uint8_t*>(dst);
+ auto* const dst16 = static_cast<uint16_t*>(dst);
int x = 0;
do {
- int16x8_t srcs[8];
- const uint16_t* src_x = src + x;
- srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
+ int16x8_t srcs[9];
+ srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
if (num_taps >= 4) {
- srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
- srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
+ srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
if (num_taps >= 6) {
- srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
- srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
+ srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
if (num_taps == 8) {
- srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
- srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
+ srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
}
}
}
- int y = 0;
+ uint8_t* d8 = dst8 + x;
+ uint16_t* d16 = dst16 + x;
+ int y = height;
do {
- srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
-
- const int16x8_t sum =
- SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[next_row + 1] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ const int16x8_t sum0 =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 0, taps);
+ const int16x8_t sum1 =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 1, taps);
if (is_compound) {
- vst1q_u16(dst16 + x + y * dst_stride, vreinterpretq_u16_s16(sum));
+ vst1q_u16(d16, vreinterpretq_u16_s16(sum0));
+ d16 += dst_stride;
+ vst1q_u16(d16, vreinterpretq_u16_s16(sum1));
+ d16 += dst_stride;
} else {
- vst1_u8(dst8 + x + y * dst_stride, vqmovun_s16(sum));
+ vst1_u8(d8, vqmovun_s16(sum0));
+ d8 += dst_stride;
+ vst1_u8(d8, vqmovun_s16(sum1));
+ d8 += dst_stride;
}
-
- srcs[0] = srcs[1];
+ srcs[0] = srcs[2];
if (num_taps >= 4) {
- srcs[1] = srcs[2];
- srcs[2] = srcs[3];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
if (num_taps >= 6) {
- srcs[3] = srcs[4];
- srcs[4] = srcs[5];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
if (num_taps == 8) {
- srcs[5] = srcs[6];
- srcs[6] = srcs[7];
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
}
}
}
- } while (++y < height);
+ y -= 2;
+ } while (y != 0);
x += 8;
} while (x < width);
}
// Take advantage of |src_stride| == |width| to process two rows at a time.
template <int num_taps, bool is_compound = false>
-void Filter2DVertical4xH(const uint16_t* src, void* const dst,
- const ptrdiff_t dst_stride, const int height,
- const int16x8_t taps) {
+void Filter2DVerticalWidth4(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x8_t taps) {
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -545,7 +585,7 @@ void Filter2DVertical4xH(const uint16_t* src, void* const dst,
}
}
- int y = 0;
+ int y = height;
do {
srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
src += 8;
@@ -580,15 +620,15 @@ void Filter2DVertical4xH(const uint16_t* src, void* const dst,
}
}
}
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
// Take advantage of |src_stride| == |width| to process four rows at a time.
template <int num_taps>
-void Filter2DVertical2xH(const uint16_t* src, void* const dst,
- const ptrdiff_t dst_stride, const int height,
- const int16x8_t taps) {
+void Filter2DVerticalWidth2(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x8_t taps) {
constexpr int next_row = (num_taps < 6) ? 4 : 8;
auto* dst8 = static_cast<uint8_t*>(dst);
@@ -672,29 +712,47 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
}
if (filter_index == 2) { // 8 tap.
- FilterHorizontal<8, 8, 2, true, is_2d, is_compound>(
+ FilterHorizontal<2, true, is_2d, is_compound>(
src, src_stride, dst, dst_stride, width, height, v_tap);
} else if (filter_index == 1) { // 6 tap.
// Check if outside taps are positive.
if ((filter_id == 1) | (filter_id == 15)) {
- FilterHorizontal<6, 8, 1, false, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<1, false, is_2d, is_compound>(
+ src + 1, src_stride, dst, dst_stride, width, height, v_tap);
} else {
- FilterHorizontal<6, 8, 1, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<1, true, is_2d, is_compound>(
+ src + 1, src_stride, dst, dst_stride, width, height, v_tap);
}
} else if (filter_index == 0) { // 6 tap.
- FilterHorizontal<6, 8, 0, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<0, true, is_2d, is_compound>(
+ src + 1, src_stride, dst, dst_stride, width, height, v_tap);
} else if (filter_index == 4) { // 4 tap.
- FilterHorizontal<4, 8, 4, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<4, true, is_2d, is_compound>(
+ src + 2, src_stride, dst, dst_stride, width, height, v_tap);
} else if (filter_index == 5) { // 4 tap.
- FilterHorizontal<4, 8, 5, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<5, true, is_2d, is_compound>(
+ src + 2, src_stride, dst, dst_stride, width, height, v_tap);
} else { // 2 tap.
- FilterHorizontal<2, 8, 3, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<3, true, is_2d, is_compound>(
+ src + 3, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+}
+
+template <int vertical_taps>
+void Filter2DVertical(const uint16_t* const intermediate_result,
+ const int width, const int height, const int16x8_t taps,
+ void* const prediction, const ptrdiff_t pred_stride) {
+ auto* const dest = static_cast<uint8_t*>(prediction);
+ if (width >= 8) {
+ Filter2DVerticalWidth8AndUp<vertical_taps>(
+ intermediate_result, dest, pred_stride, width, height, taps);
+ } else if (width == 4) {
+ Filter2DVerticalWidth4<vertical_taps>(intermediate_result, dest,
+ pred_stride, height, taps);
+ } else {
+ assert(width == 2);
+ Filter2DVerticalWidth2<vertical_taps>(intermediate_result, dest,
+ pred_stride, height, taps);
}
}
@@ -704,7 +762,7 @@ void Convolve2D_NEON(const void* const reference,
const int vertical_filter_index,
const int horizontal_filter_id,
const int vertical_filter_id, const int width,
- const int height, void* prediction,
+ const int height, void* const prediction,
const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
@@ -715,67 +773,31 @@ void Convolve2D_NEON(const void* const reference,
intermediate_result[kMaxSuperBlockSizeInPixels *
(kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
const int intermediate_height = height + vertical_taps - 1;
-
const ptrdiff_t src_stride = reference_stride;
- const auto* src = static_cast<const uint8_t*>(reference) -
- (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+ const auto* const src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride -
+ kHorizontalOffset;
DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
width, intermediate_height,
horizontal_filter_id, horiz_filter_index);
// Vertical filter.
- auto* dest = static_cast<uint8_t*>(prediction);
- const ptrdiff_t dest_stride = pred_stride;
assert(vertical_filter_id != 0);
-
const int16x8_t taps = vmovl_s8(
vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
-
if (vertical_taps == 8) {
- if (width == 2) {
- Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
- taps);
- } else if (width == 4) {
- Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
- taps);
- } else {
- Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
- taps);
- }
+ Filter2DVertical<8>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
} else if (vertical_taps == 6) {
- if (width == 2) {
- Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
- taps);
- } else if (width == 4) {
- Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
- taps);
- } else {
- Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
- taps);
- }
+ Filter2DVertical<6>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
} else if (vertical_taps == 4) {
- if (width == 2) {
- Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
- taps);
- } else if (width == 4) {
- Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
- taps);
- } else {
- Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
- taps);
- }
+ Filter2DVertical<4>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
} else { // |vertical_taps| == 2
- if (width == 2) {
- Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
- taps);
- } else if (width == 4) {
- Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
- taps);
- } else {
- Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
- taps);
- }
+ Filter2DVertical<2>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
}
}
@@ -788,7 +810,7 @@ void Convolve2D_NEON(const void* const reference,
// increments. The first load covers the initial elements of src_x, while the
// final load covers the taps.
template <int grade_x>
-inline uint8x8x3_t LoadSrcVals(const uint8_t* src_x) {
+inline uint8x8x3_t LoadSrcVals(const uint8_t* const src_x) {
uint8x8x3_t ret;
const uint8x16_t src_val = vld1q_u8(src_x);
ret.val[0] = vget_low_u8(src_val);
@@ -811,7 +833,7 @@ inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
}
template <int grade_x>
-inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
+inline void ConvolveKernelHorizontal2Tap(const uint8_t* const src,
const ptrdiff_t src_stride,
const int width, const int subpixel_x,
const int step_x,
@@ -843,7 +865,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
// on x.
const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
VQTbl1U8(filter_taps1, filter_indices)};
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x16_t src_vals = vld1q_u8(src_x);
@@ -860,7 +882,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
return;
}
@@ -883,7 +905,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
// on x.
const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
VQTbl1U8(filter_taps1, filter_indices)};
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -900,7 +922,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
x += 8;
p += step_x8;
} while (x < width);
@@ -921,7 +943,7 @@ inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
// This filter is only possible when width <= 4.
void ConvolveKernelHorizontalPositive4Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x,
const int step_x, const int intermediate_height, int16_t* intermediate) {
const int kernel_offset = 2;
const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -950,7 +972,7 @@ void ConvolveKernelHorizontalPositive4Tap(
const uint8x8_t src_indices =
vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped index vectors.
const uint8x16_t src_vals = vld1q_u8(src_x);
@@ -970,7 +992,7 @@ void ConvolveKernelHorizontalPositive4Tap(
src_x += src_stride;
intermediate += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
}
// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
@@ -988,7 +1010,7 @@ inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
// This filter is only possible when width <= 4.
inline void ConvolveKernelHorizontalSigned4Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x,
const int step_x, const int intermediate_height, int16_t* intermediate) {
const int kernel_offset = 2;
const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -1025,7 +1047,7 @@ inline void ConvolveKernelHorizontalSigned4Tap(
vadd_u8(src_indices_base, vdup_n_u8(2)),
vadd_u8(src_indices_base, vdup_n_u8(3))};
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x16_t src_vals = vld1q_u8(src_x);
@@ -1042,7 +1064,7 @@ inline void ConvolveKernelHorizontalSigned4Tap(
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
}
// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
@@ -1063,9 +1085,9 @@ inline uint8x16_t GetSigned6TapFilter(const int tap_index) {
// This filter is only possible when width >= 8.
template <int grade_x>
inline void ConvolveKernelHorizontalSigned6Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int width,
const int subpixel_x, const int step_x, const int intermediate_height,
- int16_t* intermediate) {
+ int16_t* const intermediate) {
const int kernel_offset = 1;
const uint8x8_t one = vdup_n_u8(1);
const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
@@ -1107,7 +1129,7 @@ inline void ConvolveKernelHorizontalSigned6Tap(
for (int i = 0; i < 6; ++i) {
taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
}
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -1122,7 +1144,7 @@ inline void ConvolveKernelHorizontalSigned6Tap(
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
x += 8;
p += step_x8;
} while (x < width);
@@ -1156,9 +1178,9 @@ inline int8x16_t GetMixed6TapFilter(const int tap_index) {
// This filter is only possible when width >= 8.
template <int grade_x>
inline void ConvolveKernelHorizontalMixed6Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int width,
const int subpixel_x, const int step_x, const int intermediate_height,
- int16_t* intermediate) {
+ int16_t* const intermediate) {
const int kernel_offset = 1;
const uint8x8_t one = vdup_n_u8(1);
const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
@@ -1205,7 +1227,7 @@ inline void ConvolveKernelHorizontalMixed6Tap(
mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -1224,7 +1246,7 @@ inline void ConvolveKernelHorizontalMixed6Tap(
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
x += 8;
p += step_x8;
} while (x < width);
@@ -1250,9 +1272,9 @@ inline uint8x16_t GetSigned8TapFilter(const int tap_index) {
// This filter is only possible when width >= 8.
template <int grade_x>
inline void ConvolveKernelHorizontalSigned8Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int width,
const int subpixel_x, const int step_x, const int intermediate_height,
- int16_t* intermediate) {
+ int16_t* const intermediate) {
const uint8x8_t one = vdup_n_u8(1);
const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -1290,7 +1312,7 @@ inline void ConvolveKernelHorizontalSigned8Tap(
taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
}
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -1306,7 +1328,7 @@ inline void ConvolveKernelHorizontalSigned8Tap(
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
x += 8;
p += step_x8;
} while (x < width);
@@ -1314,9 +1336,9 @@ inline void ConvolveKernelHorizontalSigned8Tap(
// This function handles blocks of width 2 or 4.
template <int num_taps, int grade_y, int width, bool is_compound>
-void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
+void ConvolveVerticalScale4xH(const int16_t* const src, const int subpixel_y,
const int filter_index, const int step_y,
- const int height, void* dest,
+ const int height, void* const dest,
const ptrdiff_t dest_stride) {
constexpr ptrdiff_t src_stride = kIntermediateStride;
const int16_t* src_y = src;
@@ -1327,8 +1349,8 @@ void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
int p = subpixel_y & 1023;
int prev_p = p;
- int y = 0;
- do { // y < height
+ int y = height;
+ do {
for (int i = 0; i < num_taps; ++i) {
s[i] = vld1_s16(src_y + i * src_stride);
}
@@ -1381,16 +1403,16 @@ void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
prev_p = p;
dest16_y += dest_stride;
dest_y += dest_stride;
-
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
template <int num_taps, int grade_y, bool is_compound>
-inline void ConvolveVerticalScale(const int16_t* src, const int width,
+inline void ConvolveVerticalScale(const int16_t* const src, const int width,
const int subpixel_y, const int filter_index,
const int step_y, const int height,
- void* dest, const ptrdiff_t dest_stride) {
+ void* const dest,
+ const ptrdiff_t dest_stride) {
constexpr ptrdiff_t src_stride = kIntermediateStride;
// A possible improvement is to use arithmetic to decide how many times to
// apply filters to same source before checking whether to load new srcs.
@@ -1401,15 +1423,15 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width,
uint8_t* dest_y;
int x = 0;
- do { // x < width
- const int16_t* src_x = src + x;
+ do {
+ const int16_t* const src_x = src + x;
const int16_t* src_y = src_x;
dest16_y = static_cast<uint16_t*>(dest) + x;
dest_y = static_cast<uint8_t*>(dest) + x;
int p = subpixel_y & 1023;
int prev_p = p;
- int y = 0;
- do { // y < height
+ int y = height;
+ do {
for (int i = 0; i < num_taps; ++i) {
s[i] = vld1q_s16(src_y + i * src_stride);
}
@@ -1448,9 +1470,8 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width,
prev_p = p;
dest16_y += dest_stride;
dest_y += dest_stride;
-
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
x += 8;
} while (x < width);
}
@@ -1462,7 +1483,7 @@ void ConvolveScale2D_NEON(const void* const reference,
const int vertical_filter_index, const int subpixel_x,
const int subpixel_y, const int step_x,
const int step_y, const int width, const int height,
- void* prediction, const ptrdiff_t pred_stride) {
+ void* const prediction, const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
assert(step_x <= 2048);
@@ -1699,12 +1720,13 @@ void ConvolveHorizontal_NEON(const void* const reference,
const int /*vertical_filter_index*/,
const int horizontal_filter_id,
const int /*vertical_filter_id*/, const int width,
- const int height, void* prediction,
+ const int height, void* const prediction,
const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
// Set |src| to the outermost tap.
- const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
- auto* dest = static_cast<uint8_t*>(prediction);
+ const auto* const src =
+ static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* const dest = static_cast<uint8_t*>(prediction);
DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
horizontal_filter_id, filter_index);
@@ -1719,14 +1741,14 @@ uint16x8_t Compound1DShift(const int16x8_t sum) {
template <int filter_index, bool is_compound = false,
bool negative_outside_taps = false>
-void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
+void FilterVertical(const uint8_t* const src, const ptrdiff_t src_stride,
void* const dst, const ptrdiff_t dst_stride,
const int width, const int height,
const uint8x8_t* const taps) {
const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
- auto* dst8 = static_cast<uint8_t*>(dst);
- auto* dst16 = static_cast<uint16_t*>(dst);
+ auto* const dst8 = static_cast<uint8_t*>(dst);
+ auto* const dst16 = static_cast<uint16_t*>(dst);
assert(width >= 8);
int x = 0;
@@ -1754,6 +1776,9 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
}
}
+ // Decreasing the y loop counter produces worse code with clang.
+ // Don't unroll this loop since it generates too much code and the decoder
+ // is even slower.
int y = 0;
do {
srcs[next_row] = vld1_u8(src_x);
@@ -1804,7 +1829,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[0] = Load4(src);
src += src_stride;
- int y = 0;
+ int y = height;
do {
srcs[0] = Load4<1>(src, srcs[0]);
src += src_stride;
@@ -1829,8 +1854,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
}
srcs[0] = srcs[2];
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else if (num_taps == 4) {
srcs[4] = vdup_n_u8(0);
@@ -1842,7 +1867,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
src += src_stride;
srcs[1] = vext_u8(srcs[0], srcs[2], 4);
- int y = 0;
+ int y = height;
do {
srcs[2] = Load4<1>(src, srcs[2]);
src += src_stride;
@@ -1869,8 +1894,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[0] = srcs[2];
srcs[1] = srcs[3];
srcs[2] = srcs[4];
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else if (num_taps == 6) {
srcs[6] = vdup_n_u8(0);
@@ -1887,7 +1912,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
src += src_stride;
srcs[3] = vext_u8(srcs[2], srcs[4], 4);
- int y = 0;
+ int y = height;
do {
srcs[4] = Load4<1>(src, srcs[4]);
src += src_stride;
@@ -1916,8 +1941,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[2] = srcs[4];
srcs[3] = srcs[5];
srcs[4] = srcs[6];
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else if (num_taps == 8) {
srcs[8] = vdup_n_u8(0);
@@ -1939,7 +1964,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
src += src_stride;
srcs[5] = vext_u8(srcs[4], srcs[6], 4);
- int y = 0;
+ int y = height;
do {
srcs[6] = Load4<1>(src, srcs[6]);
src += src_stride;
@@ -1970,8 +1995,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[4] = srcs[6];
srcs[5] = srcs[7];
srcs[6] = srcs[8];
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
}
@@ -2186,14 +2211,14 @@ void ConvolveVertical_NEON(const void* const reference,
const int vertical_filter_index,
const int /*horizontal_filter_id*/,
const int vertical_filter_id, const int width,
- const int height, void* prediction,
+ const int height, void* const prediction,
const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(filter_index);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
- auto* dest = static_cast<uint8_t*>(prediction);
+ auto* const dest = static_cast<uint8_t*>(prediction);
const ptrdiff_t dest_stride = pred_stride;
assert(vertical_filter_id != 0);
@@ -2303,7 +2328,7 @@ void ConvolveCompoundCopy_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
- const int width, const int height, void* prediction,
+ const int width, const int height, void* const prediction,
const ptrdiff_t /*pred_stride*/) {
const auto* src = static_cast<const uint8_t*>(reference);
const ptrdiff_t src_stride = reference_stride;
@@ -2312,7 +2337,7 @@ void ConvolveCompoundCopy_NEON(
kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
if (width >= 16) {
- int y = 0;
+ int y = height;
do {
int x = 0;
do {
@@ -2328,20 +2353,20 @@ void ConvolveCompoundCopy_NEON(
} while (x < width);
src += src_stride;
dest += width;
- } while (++y < height);
+ } while (--y != 0);
} else if (width == 8) {
- int y = 0;
+ int y = height;
do {
const uint8x8_t v_src = vld1_u8(&src[0]);
const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
vst1q_u16(&dest[0], v_dest);
src += src_stride;
dest += width;
- } while (++y < height);
- } else { /* width == 4 */
+ } while (--y != 0);
+ } else { // width == 4
uint8x8_t v_src = vdup_n_u8(0);
- int y = 0;
+ int y = height;
do {
v_src = Load4<0>(&src[0], v_src);
src += src_stride;
@@ -2350,8 +2375,8 @@ void ConvolveCompoundCopy_NEON(
const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
vst1q_u16(&dest[0], v_dest);
dest += 4 << 1;
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
}
@@ -2359,14 +2384,14 @@ void ConvolveCompoundVertical_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/, const int vertical_filter_index,
const int /*horizontal_filter_id*/, const int vertical_filter_id,
- const int width, const int height, void* prediction,
+ const int width, const int height, void* const prediction,
const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(filter_index);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
- auto* dest = static_cast<uint16_t*>(prediction);
+ auto* const dest = static_cast<uint16_t*>(prediction);
assert(vertical_filter_id != 0);
uint8x8_t taps[8];
@@ -2454,24 +2479,39 @@ void ConvolveCompoundHorizontal_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int horizontal_filter_index, const int /*vertical_filter_index*/,
const int horizontal_filter_id, const int /*vertical_filter_id*/,
- const int width, const int height, void* prediction,
+ const int width, const int height, void* const prediction,
const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
- const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
- auto* dest = static_cast<uint16_t*>(prediction);
+ const auto* const src =
+ static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* const dest = static_cast<uint16_t*>(prediction);
DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
src, reference_stride, dest, width, width, height, horizontal_filter_id,
filter_index);
}
+template <int vertical_taps>
+void Compound2DVertical(const uint16_t* const intermediate_result,
+ const int width, const int height, const int16x8_t taps,
+ void* const prediction) {
+ auto* const dest = static_cast<uint16_t*>(prediction);
+ if (width == 4) {
+ Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>(
+ intermediate_result, dest, width, height, taps);
+ } else {
+ Filter2DVerticalWidth8AndUp<vertical_taps, /*is_compound=*/true>(
+ intermediate_result, dest, width, width, height, taps);
+ }
+}
+
void ConvolveCompound2D_NEON(const void* const reference,
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
const int vertical_filter_index,
const int horizontal_filter_id,
const int vertical_filter_id, const int width,
- const int height, void* prediction,
+ const int height, void* const prediction,
const ptrdiff_t /*pred_stride*/) {
// The output of the horizontal filter, i.e. the intermediate_result, is
// guaranteed to fit in int16_t.
@@ -2492,55 +2532,26 @@ void ConvolveCompound2D_NEON(const void* const reference,
const auto* const src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride -
kHorizontalOffset;
-
DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
src, src_stride, intermediate_result, width, width, intermediate_height,
horizontal_filter_id, horiz_filter_index);
// Vertical filter.
- auto* dest = static_cast<uint16_t*>(prediction);
assert(vertical_filter_id != 0);
-
- const ptrdiff_t dest_stride = width;
const int16x8_t taps = vmovl_s8(
vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
-
if (vertical_taps == 8) {
- if (width == 4) {
- Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
- dest_stride, height, taps);
- } else {
- Filter2DVertical<8, /*is_compound=*/true>(
- intermediate_result, dest, dest_stride, width, height, taps);
- }
+ Compound2DVertical<8>(intermediate_result, width, height, taps, prediction);
} else if (vertical_taps == 6) {
- if (width == 4) {
- Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
- dest_stride, height, taps);
- } else {
- Filter2DVertical<6, /*is_compound=*/true>(
- intermediate_result, dest, dest_stride, width, height, taps);
- }
+ Compound2DVertical<6>(intermediate_result, width, height, taps, prediction);
} else if (vertical_taps == 4) {
- if (width == 4) {
- Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
- dest_stride, height, taps);
- } else {
- Filter2DVertical<4, /*is_compound=*/true>(
- intermediate_result, dest, dest_stride, width, height, taps);
- }
+ Compound2DVertical<4>(intermediate_result, width, height, taps, prediction);
} else { // |vertical_taps| == 2
- if (width == 4) {
- Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
- dest_stride, height, taps);
- } else {
- Filter2DVertical<2, /*is_compound=*/true>(
- intermediate_result, dest, dest_stride, width, height, taps);
- }
+ Compound2DVertical<2>(intermediate_result, width, height, taps, prediction);
}
}
-inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+inline void HalfAddHorizontal(const uint8_t* const src, uint8_t* const dst) {
const uint8x16_t left = vld1q_u8(src);
const uint8x16_t right = vld1q_u8(src + 1);
vst1q_u8(dst, vrhaddq_u8(left, right));
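HalfAddHorizontal above relies on vrhaddq_u8, a rounding halving add, to average each pixel with its right-hand neighbour for the intra block copy paths that follow. A minimal scalar sketch of the same operation (the helper name is illustrative, not part of libgav1):

#include <cstdint>

// Scalar equivalent of vrhaddq_u8(left, right) applied across a row:
// dst[x] = (src[x] + src[x + 1] + 1) >> 1.
inline void HalfAddHorizontalScalar(const uint8_t* src, uint8_t* dst,
                                    int width) {
  for (int x = 0; x < width; ++x) {
    // Rounding halving add, matching the NEON instruction.
    dst[x] = static_cast<uint8_t>((src[x] + src[x + 1] + 1) >> 1);
  }
}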
@@ -2554,7 +2565,7 @@ inline void IntraBlockCopyHorizontal(const uint8_t* src,
const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
- int y = 0;
+ int y = height;
do {
HalfAddHorizontal(src, dst);
if (width >= 32) {
@@ -2586,7 +2597,7 @@ inline void IntraBlockCopyHorizontal(const uint8_t* src,
}
src += src_remainder_stride;
dst += dst_remainder_stride;
- } while (++y < height);
+ } while (--y != 0);
}
void ConvolveIntraBlockCopyHorizontal_NEON(
@@ -2610,7 +2621,7 @@ void ConvolveIntraBlockCopyHorizontal_NEON(
IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
pred_stride);
} else if (width == 8) {
- int y = 0;
+ int y = height;
do {
const uint8x8_t left = vld1_u8(src);
const uint8x8_t right = vld1_u8(src + 1);
@@ -2618,11 +2629,11 @@ void ConvolveIntraBlockCopyHorizontal_NEON(
src += reference_stride;
dest += pred_stride;
- } while (++y < height);
+ } while (--y != 0);
} else if (width == 4) {
uint8x8_t left = vdup_n_u8(0);
uint8x8_t right = vdup_n_u8(0);
- int y = 0;
+ int y = height;
do {
left = Load4<0>(src, left);
right = Load4<0>(src + 1, right);
@@ -2637,13 +2648,13 @@ void ConvolveIntraBlockCopyHorizontal_NEON(
dest += pred_stride;
StoreHi4(dest, result);
dest += pred_stride;
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else {
assert(width == 2);
uint8x8_t left = vdup_n_u8(0);
uint8x8_t right = vdup_n_u8(0);
- int y = 0;
+ int y = height;
do {
left = Load2<0>(src, left);
right = Load2<0>(src + 1, right);
@@ -2658,8 +2669,8 @@ void ConvolveIntraBlockCopyHorizontal_NEON(
dest += pred_stride;
Store2<1>(dest, result);
dest += pred_stride;
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
}
@@ -2694,7 +2705,7 @@ inline void IntraBlockCopyVertical(const uint8_t* src,
}
src += src_remainder_stride;
- int y = 0;
+ int y = height;
do {
below[0] = vld1q_u8(src);
if (width >= 32) {
@@ -2749,7 +2760,7 @@ inline void IntraBlockCopyVertical(const uint8_t* src,
}
}
dst += dst_remainder_stride;
- } while (++y < height);
+ } while (--y != 0);
}
void ConvolveIntraBlockCopyVertical_NEON(
@@ -2778,7 +2789,7 @@ void ConvolveIntraBlockCopyVertical_NEON(
row = vld1_u8(src);
src += reference_stride;
- int y = 0;
+ int y = height;
do {
below = vld1_u8(src);
src += reference_stride;
@@ -2787,13 +2798,13 @@ void ConvolveIntraBlockCopyVertical_NEON(
dest += pred_stride;
row = below;
- } while (++y < height);
+ } while (--y != 0);
} else if (width == 4) {
uint8x8_t row = Load4(src);
uint8x8_t below = vdup_n_u8(0);
src += reference_stride;
- int y = 0;
+ int y = height;
do {
below = Load4<0>(src, below);
src += reference_stride;
@@ -2802,14 +2813,14 @@ void ConvolveIntraBlockCopyVertical_NEON(
dest += pred_stride;
row = below;
- } while (++y < height);
+ } while (--y != 0);
} else {
assert(width == 2);
uint8x8_t row = Load2(src);
uint8x8_t below = vdup_n_u8(0);
src += reference_stride;
- int y = 0;
+ int y = height;
do {
below = Load2<0>(src, below);
src += reference_stride;
@@ -2818,7 +2829,7 @@ void ConvolveIntraBlockCopyVertical_NEON(
dest += pred_stride;
row = below;
- } while (++y < height);
+ } while (--y != 0);
}
}
@@ -2870,7 +2881,7 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
}
src += src_remainder_stride;
- int y = 0;
+ int y = height;
do {
const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2));
@@ -2981,7 +2992,7 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
}
src += src_remainder_stride;
dst += dst_remainder_stride;
- } while (++y < height);
+ } while (--y != 0);
}
void ConvolveIntraBlockCopy2D_NEON(
@@ -3013,7 +3024,7 @@ void ConvolveIntraBlockCopy2D_NEON(
uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
- int y = 0;
+ int y = height;
do {
left = Load4<0>(src, left);
right = Load4<0>(src + 1, right);
@@ -3032,8 +3043,8 @@ void ConvolveIntraBlockCopy2D_NEON(
dest += pred_stride;
row = vget_high_u16(below);
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else {
uint8x8_t left = Load2(src);
uint8x8_t right = Load2(src + 1);
@@ -3041,7 +3052,7 @@ void ConvolveIntraBlockCopy2D_NEON(
uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
- int y = 0;
+ int y = height;
do {
left = Load2<0>(src, left);
right = Load2<0>(src + 1, right);
@@ -3060,8 +3071,8 @@ void ConvolveIntraBlockCopy2D_NEON(
dest += pred_stride;
row = vget_high_u16(below);
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
}
@@ -3093,7 +3104,7 @@ void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
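A recurring change throughout convolve_neon.cc above is switching row loops from `int y = 0; ... while (++y < height)` to `int y = height; ... while (--y != 0)`, so the exit test compares the freshly decremented counter against zero. A small sketch of the pattern, assuming height is positive (the function name is illustrative):

#include <cstddef>
#include <cstdint>

// Row loop counting down to zero instead of up to |height|.
uint32_t SumRowsCountingDown(const uint8_t* src, ptrdiff_t stride,
                             int width, int height) {
  uint32_t sum = 0;
  int y = height;
  do {
    for (int x = 0; x < width; ++x) sum += src[x];
    src += stride;
  } while (--y != 0);  // Decrement and test against zero in one step.
  return sum;
}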
diff --git a/src/dsp/arm/distance_weighted_blend_neon.cc b/src/dsp/arm/distance_weighted_blend_neon.cc
index 04952ab..a0cd0ac 100644
--- a/src/dsp/arm/distance_weighted_blend_neon.cc
+++ b/src/dsp/arm/distance_weighted_blend_neon.cc
@@ -30,10 +30,12 @@
namespace libgav1 {
namespace dsp {
-namespace {
constexpr int kInterPostRoundBit = 4;
+namespace low_bitdepth {
+namespace {
+
inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0,
const int16x8_t pred1,
const int16x4_t weights[2]) {
@@ -185,13 +187,167 @@ void Init8bpp() {
}
} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+inline uint16x4x2_t ComputeWeightedAverage8(const uint16x4x2_t pred0,
+ const uint16x4x2_t pred1,
+ const uint16x4_t weights[2]) {
+ const uint32x4_t wpred0_lo = vmull_u16(weights[0], pred0.val[0]);
+ const uint32x4_t wpred0_hi = vmull_u16(weights[0], pred0.val[1]);
+ const uint32x4_t blended_lo = vmlal_u16(wpred0_lo, weights[1], pred1.val[0]);
+ const uint32x4_t blended_hi = vmlal_u16(wpred0_hi, weights[1], pred1.val[1]);
+ const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16);
+ const int32x4_t res_lo = vsubq_s32(vreinterpretq_s32_u32(blended_lo), offset);
+ const int32x4_t res_hi = vsubq_s32(vreinterpretq_s32_u32(blended_hi), offset);
+ const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1);
+ // Clip the result at (1 << bd) - 1.
+ uint16x4x2_t result;
+ result.val[0] =
+ vmin_u16(vqrshrun_n_s32(res_lo, kInterPostRoundBit + 4), bd_max);
+ result.val[1] =
+ vmin_u16(vqrshrun_n_s32(res_hi, kInterPostRoundBit + 4), bd_max);
+ return result;
+}
+
+inline uint16x4x4_t ComputeWeightedAverage8(const uint16x4x4_t pred0,
+ const uint16x4x4_t pred1,
+ const uint16x4_t weights[2]) {
+ const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16);
+ const uint32x4_t wpred0 = vmull_u16(weights[0], pred0.val[0]);
+ const uint32x4_t wpred1 = vmull_u16(weights[0], pred0.val[1]);
+ const uint32x4_t blended0 = vmlal_u16(wpred0, weights[1], pred1.val[0]);
+ const uint32x4_t blended1 = vmlal_u16(wpred1, weights[1], pred1.val[1]);
+ const int32x4_t res0 = vsubq_s32(vreinterpretq_s32_u32(blended0), offset);
+ const int32x4_t res1 = vsubq_s32(vreinterpretq_s32_u32(blended1), offset);
+ const uint32x4_t wpred2 = vmull_u16(weights[0], pred0.val[2]);
+ const uint32x4_t wpred3 = vmull_u16(weights[0], pred0.val[3]);
+ const uint32x4_t blended2 = vmlal_u16(wpred2, weights[1], pred1.val[2]);
+ const uint32x4_t blended3 = vmlal_u16(wpred3, weights[1], pred1.val[3]);
+ const int32x4_t res2 = vsubq_s32(vreinterpretq_s32_u32(blended2), offset);
+ const int32x4_t res3 = vsubq_s32(vreinterpretq_s32_u32(blended3), offset);
+ const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1);
+ // Clip the result at (1 << bd) - 1.
+ uint16x4x4_t result;
+ result.val[0] =
+ vmin_u16(vqrshrun_n_s32(res0, kInterPostRoundBit + 4), bd_max);
+ result.val[1] =
+ vmin_u16(vqrshrun_n_s32(res1, kInterPostRoundBit + 4), bd_max);
+ result.val[2] =
+ vmin_u16(vqrshrun_n_s32(res2, kInterPostRoundBit + 4), bd_max);
+ result.val[3] =
+ vmin_u16(vqrshrun_n_s32(res3, kInterPostRoundBit + 4), bd_max);
+
+ return result;
+}
+
+// We could use vld1_u16_x2, but for compatibility reasons, use this function
+// instead. The compiler optimizes to the correct instruction.
+inline uint16x4x2_t LoadU16x4_x2(uint16_t const* ptr) {
+ uint16x4x2_t x;
+ // gcc/clang (64 bit) optimizes the following to ldp.
+ x.val[0] = vld1_u16(ptr);
+ x.val[1] = vld1_u16(ptr + 4);
+ return x;
+}
+
+// We could use vld1_u16_x4, but for compatibility reasons, use this function
+// instead. The compiler optimizes to a pair of vld1_u16_x2, which showed better
+// performance in the speed tests.
+inline uint16x4x4_t LoadU16x4_x4(uint16_t const* ptr) {
+ uint16x4x4_t x;
+ x.val[0] = vld1_u16(ptr);
+ x.val[1] = vld1_u16(ptr + 4);
+ x.val[2] = vld1_u16(ptr + 8);
+ x.val[3] = vld1_u16(ptr + 12);
+ return x;
+}
+
+void DistanceWeightedBlend_NEON(const void* prediction_0,
+ const void* prediction_1,
+ const uint8_t weight_0, const uint8_t weight_1,
+ const int width, const int height,
+ void* const dest, const ptrdiff_t dest_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+ const uint16x4_t weights[2] = {vdup_n_u16(weight_0), vdup_n_u16(weight_1)};
-void DistanceWeightedBlendInit_NEON() { Init8bpp(); }
+ if (width == 4) {
+ int y = height;
+ do {
+ const uint16x4x2_t src0 = LoadU16x4_x2(pred_0);
+ const uint16x4x2_t src1 = LoadU16x4_x2(pred_1);
+ const uint16x4x2_t res = ComputeWeightedAverage8(src0, src1, weights);
+ vst1_u16(dst, res.val[0]);
+ vst1_u16(dst + dst_stride, res.val[1]);
+ dst += dst_stride << 1;
+ pred_0 += 8;
+ pred_1 += 8;
+ y -= 2;
+ } while (y != 0);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ const uint16x4x4_t src0 = LoadU16x4_x4(pred_0);
+ const uint16x4x4_t src1 = LoadU16x4_x4(pred_1);
+ const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights);
+ vst1_u16(dst, res.val[0]);
+ vst1_u16(dst + 4, res.val[1]);
+ vst1_u16(dst + dst_stride, res.val[2]);
+ vst1_u16(dst + dst_stride + 4, res.val[3]);
+ dst += dst_stride << 1;
+ pred_0 += 16;
+ pred_1 += 16;
+ y -= 2;
+ } while (y != 0);
+ } else {
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const uint16x4x4_t src0 = LoadU16x4_x4(pred_0 + x);
+ const uint16x4x4_t src1 = LoadU16x4_x4(pred_1 + x);
+ const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights);
+ vst1_u16(dst + x, res.val[0]);
+ vst1_u16(dst + x + 4, res.val[1]);
+ vst1_u16(dst + x + 8, res.val[2]);
+ vst1_u16(dst + x + 12, res.val[3]);
+ x += 16;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+ }
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->distance_weighted_blend = DistanceWeightedBlend_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void DistanceWeightedBlendInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
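The new 10bpp path keeps the predictions in unsigned 16-bit compound form, so each output is a weighted sum minus the compound offset, rounded and clamped to the 10-bit range. A scalar sketch of the per-pixel arithmetic in ComputeWeightedAverage8 (the function name and the |compound_offset| parameter are illustrative; the NEON code above subtracts kCompoundOffset * 16):

#include <algorithm>
#include <cstdint>

// weight0 + weight1 == 16, so the shift carries an extra 4 bits.
inline uint16_t BlendPixel10bpp(uint16_t pred0, uint16_t pred1,
                                uint8_t weight0, uint8_t weight1,
                                int compound_offset) {
  constexpr int kInterPostRoundBit = 4;
  constexpr int kShift = kInterPostRoundBit + 4;
  const int blended = weight0 * pred0 + weight1 * pred1 - compound_offset;
  // Round-shift, then clamp like vqrshrun_n_s32 followed by vmin_u16.
  const int rounded = (blended + (1 << (kShift - 1))) >> kShift;
  return static_cast<uint16_t>(std::clamp(rounded, 0, (1 << 10) - 1));
}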
diff --git a/src/dsp/arm/distance_weighted_blend_neon.h b/src/dsp/arm/distance_weighted_blend_neon.h
index 4d8824c..94a799c 100644
--- a/src/dsp/arm/distance_weighted_blend_neon.h
+++ b/src/dsp/arm/distance_weighted_blend_neon.h
@@ -34,6 +34,8 @@ void DistanceWeightedBlendInit_NEON();
#if LIBGAV1_ENABLE_NEON
#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON
+
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc
index 2612466..8ee3745 100644
--- a/src/dsp/arm/film_grain_neon.cc
+++ b/src/dsp/arm/film_grain_neon.cc
@@ -1176,7 +1176,7 @@ void FilmGrainInit_NEON() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/intra_edge_neon.cc b/src/dsp/arm/intra_edge_neon.cc
index 00b186a..074283f 100644
--- a/src/dsp/arm/intra_edge_neon.cc
+++ b/src/dsp/arm/intra_edge_neon.cc
@@ -25,7 +25,7 @@
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
-#include "src/utils/common.h" // RightShiftWithRounding()
+#include "src/utils/common.h"
namespace libgav1 {
namespace dsp {
@@ -35,6 +35,11 @@ namespace {
// required.
constexpr int kKernelsNEON[3][2] = {{4, 8}, {5, 6}};
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
assert(strength == 1 || strength == 2 || strength == 3);
const int kernel_index = strength - 1;
@@ -44,6 +49,8 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
// elements written is |size| - 1.
if (size == 1) return;
+ const uint8x16_t v_index = vcombine_u8(vcreate_u8(0x0706050403020100),
+ vcreate_u8(0x0f0e0d0c0b0a0908));
// |strength| 1 and 2 use a 3 tap filter.
if (strength < 3) {
// The last value requires extending the buffer (duplicating
@@ -89,7 +96,6 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
// |remainder| == 1 then we don't have to do anything.
const int remainder = (size - 1) & 0xf;
if (remainder > 1) {
- uint8_t temp[16];
const uint8x16_t src_1 = vld1q_u8(dst_buffer + i);
const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1);
@@ -102,9 +108,11 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
const uint8x16_t result =
vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
-
- vst1q_u8(temp, result);
- memcpy(dst_buffer + i, temp, remainder);
+ const uint8x16_t v_remainder = vdupq_n_u8(remainder);
+      // Create overwrite mask.
+ const uint8x16_t mask = vcleq_u8(v_remainder, v_index);
+ const uint8x16_t dst_remainder = vbslq_u8(mask, src_1, result);
+ vst1q_u8(dst_buffer + i, dst_remainder);
}
dst_buffer[size - 1] = last_val;
@@ -173,7 +181,6 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
// Like the 3 tap but if there are two remaining values we have already
// calculated them.
if (remainder > 2) {
- uint8_t temp[16];
const uint8x16_t src_2 = vld1q_u8(dst_buffer + i);
const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1);
const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2);
@@ -193,9 +200,11 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
const uint8x16_t result =
vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
-
- vst1q_u8(temp, result);
- memcpy(dst_buffer + i, temp, remainder);
+ const uint8x16_t v_remainder = vdupq_n_u8(remainder);
+      // Create overwrite mask.
+ const uint8x16_t mask = vcleq_u8(v_remainder, v_index);
+ const uint8x16_t dst_remainder = vbslq_u8(mask, src_2, result);
+ vst1q_u8(dst_buffer + i, dst_remainder);
}
dst_buffer[1] = special_vals[0];
@@ -284,13 +293,225 @@ void Init8bpp() {
}
} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+const uint16_t kRemainderMask[8][8] = {
+ {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000},
+};
+
+void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
+ assert(strength == 1 || strength == 2 || strength == 3);
+ const int kernel_index = strength - 1;
+ auto* const dst_buffer = static_cast<uint16_t*>(buffer);
+
+ // The first element is not written out (but it is input) so the number of
+ // elements written is |size| - 1.
+ if (size == 1) return;
+
+ // |strength| 1 and 2 use a 3 tap filter.
+ if (strength < 3) {
+    // The last value requires extending the buffer (duplicating
+    // |dst_buffer[size - 1]|). Calculate it here to avoid extra processing in
+    // NEON.
+ const uint16_t last_val = RightShiftWithRounding(
+ kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] +
+ kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] +
+ kKernelsNEON[kernel_index][0] * dst_buffer[size - 1],
+ 4);
+
+ const uint16_t krn0 = kKernelsNEON[kernel_index][0];
+ const uint16_t krn1 = kKernelsNEON[kernel_index][1];
+
+ // The first value we need gets overwritten by the output from the
+ // previous iteration.
+ uint16x8_t src_0 = vld1q_u16(dst_buffer);
+ int i = 1;
+
+  // Process blocks until there are fewer than 8 values remaining.
+ for (; i < size - 7; i += 8) {
+ // Loading these at the end of the block with |src_0| will read past the
+ // end of |top_row_data[160]|, the source of |buffer|.
+ const uint16x8_t src_1 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0);
+ const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1);
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+ // Load the next row before overwriting. This loads an extra 7 values
+ // past |size| on the trailing iteration.
+ src_0 = vld1q_u16(dst_buffer + i + 7);
+ vst1q_u16(dst_buffer + i, result);
+ }
+
+ // The last output value |last_val| was already calculated so if
+ // |remainder| == 1 then we don't have to do anything.
+ const int remainder = (size - 1) & 0x7;
+ if (remainder > 1) {
+ const uint16x8_t src_1 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0);
+ const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1);
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+ const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]);
+ const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_1);
+ vst1q_u16(dst_buffer + i, dst_remainder);
+ }
+
+ dst_buffer[size - 1] = last_val;
+ return;
+ }
+
+ assert(strength == 3);
+ // 5 tap filter. The first element requires duplicating |buffer[0]| and the
+ // last two elements require duplicating |buffer[size - 1]|.
+ uint16_t special_vals[3];
+ special_vals[0] = RightShiftWithRounding(
+ (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) +
+ (dst_buffer[2] << 2) + (dst_buffer[3] << 1),
+ 4);
+ // Clamp index for very small |size| values.
+ const int first_index_min = std::max(size - 4, 0);
+ const int second_index_min = std::max(size - 3, 0);
+ const int third_index_min = std::max(size - 2, 0);
+ special_vals[1] = RightShiftWithRounding(
+ (dst_buffer[first_index_min] << 1) + (dst_buffer[second_index_min] << 2) +
+ (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) +
+ (dst_buffer[size - 1] << 1),
+ 4);
+ special_vals[2] = RightShiftWithRounding(
+ (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) +
+ // x << 2 + x << 2 == x << 3
+ (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1),
+ 4);
+
+ // The first two values we need get overwritten by the output from the
+ // previous iteration.
+ uint16x8_t src_0 = vld1q_u16(dst_buffer - 1);
+ uint16x8_t src_1 = vld1q_u16(dst_buffer);
+ int i = 1;
+
+ for (; i < size - 7; i += 8) {
+ // Loading these at the end of the block with |src_[01]| will read past
+ // the end of |top_row_data[160]|, the source of |buffer|.
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2);
+ const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1);
+ const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3);
+ const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2));
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+
+    // Load the next rows before overwriting.
+ src_0 = vld1q_u16(dst_buffer + i + 6);
+ src_1 = vld1q_u16(dst_buffer + i + 7);
+
+ vst1q_u16(dst_buffer + i, result);
+ }
+
+ const int remainder = (size - 1) & 0x7;
+ // Like the 3 tap but if there are two remaining values we have already
+ // calculated them.
+ if (remainder > 2) {
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2);
+ const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1);
+ const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3);
+ const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2));
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+ const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]);
+ const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_2);
+ vst1q_u16(dst_buffer + i, dst_remainder);
+ }
+
+ dst_buffer[1] = special_vals[0];
+ // Avoid overwriting |dst_buffer[0]|.
+ if (size > 2) dst_buffer[size - 2] = special_vals[1];
+ dst_buffer[size - 1] = special_vals[2];
+}
+
+void IntraEdgeUpsampler_NEON(void* buffer, const int size) {
+ assert(size % 4 == 0 && size <= 16);
+ auto* const pixel_buffer = static_cast<uint16_t*>(buffer);
-void IntraEdgeInit_NEON() { Init8bpp(); }
+ // Extend first/last samples
+ pixel_buffer[-2] = pixel_buffer[-1];
+ pixel_buffer[size] = pixel_buffer[size - 1];
+
+ const int16x8_t src_lo = vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2));
+ const int16x8_t src_hi =
+ vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2 + 8));
+ const int16x8_t src9_hi = vaddq_s16(src_hi, vshlq_n_s16(src_hi, 3));
+ const int16x8_t src9_lo = vaddq_s16(src_lo, vshlq_n_s16(src_lo, 3));
+
+ int16x8_t sum_lo = vsubq_s16(vextq_s16(src9_lo, src9_hi, 1), src_lo);
+ sum_lo = vaddq_s16(sum_lo, vextq_s16(src9_lo, src9_hi, 2));
+ sum_lo = vsubq_s16(sum_lo, vextq_s16(src_lo, src_hi, 3));
+ sum_lo = vrshrq_n_s16(sum_lo, 4);
+
+ uint16x8x2_t result_lo;
+ result_lo.val[0] =
+ vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_lo, vdupq_n_s16(0))),
+ vdupq_n_u16((1 << kBitdepth10) - 1));
+ result_lo.val[1] = vreinterpretq_u16_s16(vextq_s16(src_lo, src_hi, 2));
+
+ if (size > 8) {
+ const int16x8_t src_hi_extra =
+ vreinterpretq_s16_u16(vld1q_u16(pixel_buffer + 16 - 2));
+ const int16x8_t src9_hi_extra =
+ vaddq_s16(src_hi_extra, vshlq_n_s16(src_hi_extra, 3));
+
+ int16x8_t sum_hi = vsubq_s16(vextq_s16(src9_hi, src9_hi_extra, 1), src_hi);
+ sum_hi = vaddq_s16(sum_hi, vextq_s16(src9_hi, src9_hi_extra, 2));
+ sum_hi = vsubq_s16(sum_hi, vextq_s16(src_hi, src_hi_extra, 3));
+ sum_hi = vrshrq_n_s16(sum_hi, 4);
+
+ uint16x8x2_t result_hi;
+ result_hi.val[0] =
+ vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_hi, vdupq_n_s16(0))),
+ vdupq_n_u16((1 << kBitdepth10) - 1));
+ result_hi.val[1] =
+ vreinterpretq_u16_s16(vextq_s16(src_hi, src_hi_extra, 2));
+ vst2q_u16(pixel_buffer - 1, result_lo);
+ vst2q_u16(pixel_buffer + 15, result_hi);
+ } else {
+ vst2q_u16(pixel_buffer - 1, result_lo);
+ }
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->intra_edge_filter = IntraEdgeFilter_NEON;
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraEdgeInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
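Both the 8bpp and the new 10bpp IntraEdgeFilter_NEON above now handle the tail of the edge with a lane-select mask instead of writing the filtered vector to a temporary and memcpy'ing |remainder| bytes. A scalar sketch of the idea (illustrative names; the NEON versions build the mask with vcleq against an index vector, or load it from kRemainderMask, and blend with vbsl before a full-width store):

#include <cstdint>

template <typename Pixel, int kLanes>
inline void StoreTailWithMask(Pixel* dst, const Pixel (&result)[kLanes],
                              const Pixel (&original)[kLanes], int remainder) {
  for (int lane = 0; lane < kLanes; ++lane) {
    // Lanes below |remainder| take the filtered result; the rest keep the
    // original contents so the full-width store does not clobber them.
    dst[lane] = (lane < remainder) ? result[lane] : original[lane];
  }
}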
diff --git a/src/dsp/arm/intra_edge_neon.h b/src/dsp/arm/intra_edge_neon.h
index d3bb243..28e3494 100644
--- a/src/dsp/arm/intra_edge_neon.h
+++ b/src/dsp/arm/intra_edge_neon.h
@@ -34,6 +34,9 @@ void IntraEdgeInit_NEON();
#define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_IntraEdgeFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON
+
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
diff --git a/src/dsp/arm/intrapred_cfl_neon.cc b/src/dsp/arm/intrapred_cfl_neon.cc
index 45fe33b..8d8748f 100644
--- a/src/dsp/arm/intrapred_cfl_neon.cc
+++ b/src/dsp/arm/intrapred_cfl_neon.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_cfl.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
@@ -27,45 +27,20 @@
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
-namespace low_bitdepth {
-namespace {
-
-uint8x16_t Set2ValuesQ(const uint8_t* a) {
- uint16_t combined_values = a[0] | a[1] << 8;
- return vreinterpretq_u8_u16(vdupq_n_u16(combined_values));
-}
-
-uint32_t SumVector(uint32x2_t a) {
-#if defined(__aarch64__)
- return vaddv_u32(a);
-#else
- const uint64x1_t b = vpaddl_u32(a);
- return vget_lane_u32(vreinterpret_u32_u64(b), 0);
-#endif // defined(__aarch64__)
-}
-
-uint32_t SumVector(uint32x4_t a) {
-#if defined(__aarch64__)
- return vaddvq_u32(a);
-#else
- const uint64x2_t b = vpaddlq_u32(a);
- const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
- return vget_lane_u32(vreinterpret_u32_u64(c), 0);
-#endif // defined(__aarch64__)
-}
// Divide by the number of elements.
-uint32_t Average(const uint32_t sum, const int width, const int height) {
+inline uint32_t Average(const uint32_t sum, const int width, const int height) {
return RightShiftWithRounding(sum, FloorLog2(width) + FloorLog2(height));
}
// Subtract |val| from every element in |a|.
-void BlockSubtract(const uint32_t val,
- int16_t a[kCflLumaBufferStride][kCflLumaBufferStride],
- const int width, const int height) {
+inline void BlockSubtract(const uint32_t val,
+ int16_t a[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int width, const int height) {
assert(val <= INT16_MAX);
const int16x8_t val_v = vdupq_n_s16(static_cast<int16_t>(val));
@@ -94,6 +69,9 @@ void BlockSubtract(const uint32_t val,
}
}
+namespace low_bitdepth {
+namespace {
+
template <int block_width, int block_height>
void CflSubsampler420_NEON(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
@@ -122,26 +100,27 @@ void CflSubsampler420_NEON(
sum = SumVector(running_sum);
} else if (block_width == 8) {
- const uint8x16_t x_index = {0, 0, 2, 2, 4, 4, 6, 6,
- 8, 8, 10, 10, 12, 12, 14, 14};
- const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
- const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
+ const uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14};
+ const uint16x8_t x_max_index =
+ vdupq_n_u16(max_luma_width == 8 ? max_luma_width - 2 : 16);
+ const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index);
uint32x4_t running_sum = vdupq_n_u32(0);
for (int y = 0; y < block_height; ++y) {
- const uint8x16_t x_max0 = Set2ValuesQ(src + max_luma_width - 2);
- const uint8x16_t x_max1 = Set2ValuesQ(src + max_luma_width - 2 + stride);
+ const uint8x16_t row0 = vld1q_u8(src);
+ const uint8x16_t row1 = vld1q_u8(src + stride);
+ const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
+ const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1);
- uint8x16_t row0 = vld1q_u8(src);
- row0 = vbslq_u8(x_mask, row0, x_max0);
- uint8x16_t row1 = vld1q_u8(src + stride);
- row1 = vbslq_u8(x_mask, row1, x_max1);
+ // Dup the 2x2 sum at the max luma offset.
+ const uint16x8_t max_luma_sum =
+ vdupq_lane_u16(vget_low_u16(sum_row_shifted), 3);
+ const uint16x8_t final_sum_row =
+ vbslq_u16(x_mask, sum_row_shifted, max_luma_sum);
+ vst1q_s16(luma[y], vreinterpretq_s16_u16(final_sum_row));
- uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
- sum_row = vshlq_n_u16(sum_row, 1);
- running_sum = vpadalq_u16(running_sum, sum_row);
- vst1q_s16(luma[y], vreinterpretq_s16_u16(sum_row));
+ running_sum = vpadalq_u16(running_sum, final_sum_row);
if (y << 1 < max_luma_height - 2) {
src += stride << 1;
@@ -150,45 +129,35 @@ void CflSubsampler420_NEON(
sum = SumVector(running_sum);
} else /* block_width >= 16 */ {
- const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
+ const uint16x8_t x_max_index = vdupq_n_u16(max_luma_width - 2);
uint32x4_t running_sum = vdupq_n_u32(0);
for (int y = 0; y < block_height; ++y) {
- uint8x16_t x_index = {0, 2, 4, 6, 8, 10, 12, 14,
- 16, 18, 20, 22, 24, 26, 28, 30};
- const uint8x16_t x_max00 = vdupq_n_u8(src[max_luma_width - 2]);
- const uint8x16_t x_max01 = vdupq_n_u8(src[max_luma_width - 2 + 1]);
- const uint8x16_t x_max10 = vdupq_n_u8(src[stride + max_luma_width - 2]);
- const uint8x16_t x_max11 =
- vdupq_n_u8(src[stride + max_luma_width - 2 + 1]);
- for (int x = 0; x < block_width; x += 16) {
- const ptrdiff_t src_x_offset = x << 1;
- const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
- const uint8x16x2_t row0 = vld2q_u8(src + src_x_offset);
- const uint8x16x2_t row1 = vld2q_u8(src + src_x_offset + stride);
- const uint8x16_t row_masked_00 = vbslq_u8(x_mask, row0.val[0], x_max00);
- const uint8x16_t row_masked_01 = vbslq_u8(x_mask, row0.val[1], x_max01);
- const uint8x16_t row_masked_10 = vbslq_u8(x_mask, row1.val[0], x_max10);
- const uint8x16_t row_masked_11 = vbslq_u8(x_mask, row1.val[1], x_max11);
-
- uint16x8_t sum_row_lo =
- vaddl_u8(vget_low_u8(row_masked_00), vget_low_u8(row_masked_01));
- sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_10));
- sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_11));
- sum_row_lo = vshlq_n_u16(sum_row_lo, 1);
- running_sum = vpadalq_u16(running_sum, sum_row_lo);
- vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(sum_row_lo));
-
- uint16x8_t sum_row_hi =
- vaddl_u8(vget_high_u8(row_masked_00), vget_high_u8(row_masked_01));
- sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_10));
- sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_11));
- sum_row_hi = vshlq_n_u16(sum_row_hi, 1);
- running_sum = vpadalq_u16(running_sum, sum_row_hi);
- vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(sum_row_hi));
-
- x_index = vaddq_u8(x_index, vdupq_n_u8(32));
+      // Calculate the 2x2 sum at the max_luma offset.
+ const uint8_t a00 = src[max_luma_width - 2];
+ const uint8_t a01 = src[max_luma_width - 1];
+ const uint8_t a10 = src[max_luma_width - 2 + stride];
+ const uint8_t a11 = src[max_luma_width - 1 + stride];
+ // Dup the 2x2 sum at the max luma offset.
+ const uint16x8_t max_luma_sum =
+ vdupq_n_u16((uint16_t)((a00 + a01 + a10 + a11) << 1));
+ uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14};
+
+ ptrdiff_t src_x_offset = 0;
+ for (int x = 0; x < block_width; x += 8, src_x_offset += 16) {
+ const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index);
+ const uint8x16_t row0 = vld1q_u8(src + src_x_offset);
+ const uint8x16_t row1 = vld1q_u8(src + src_x_offset + stride);
+ const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
+ const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1);
+ const uint16x8_t final_sum_row =
+ vbslq_u16(x_mask, sum_row_shifted, max_luma_sum);
+ vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(final_sum_row));
+
+ running_sum = vpadalq_u16(running_sum, final_sum_row);
+ x_index = vaddq_u16(x_index, vdupq_n_u16(16));
}
+
if (y << 1 < max_luma_height - 2) {
src += stride << 1;
}
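The rewritten 4:2:0 subsamplers above compute each CfL luma value as a left-shifted 2x2 sum and splat the sum taken at the last visible column pair across the out-of-bounds lanes. A scalar sketch of one output row, with illustrative names (the NEON code reaches the same values with vpaddlq_u8/vpadalq_u8 plus a vbslq_u16 against the duplicated edge sum):

#include <cstddef>
#include <cstdint>

inline void CflSubsample420RowScalar(const uint8_t* src, ptrdiff_t stride,
                                     int block_width, int max_luma_width,
                                     int16_t* luma_row) {
  // 2x2 sum at the last visible column pair, reused past the visible width.
  const int last = max_luma_width - 2;
  const int16_t edge_sum = static_cast<int16_t>(
      (src[last] + src[last + 1] + src[stride + last] +
       src[stride + last + 1]) << 1);
  for (int x = 0; x < block_width; ++x) {
    const int sx = x << 1;
    if (sx < max_luma_width) {
      luma_row[x] = static_cast<int16_t>(
          (src[sx] + src[sx + 1] + src[stride + sx] + src[stride + sx + 1])
          << 1);
    } else {
      luma_row[x] = edge_sum;
    }
  }
}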
@@ -209,17 +178,30 @@ void CflSubsampler444_NEON(
uint32_t sum;
if (block_width == 4) {
assert(max_luma_width >= 4);
+ assert(max_luma_height <= block_height);
+ assert((max_luma_height % 2) == 0);
uint32x4_t running_sum = vdupq_n_u32(0);
uint8x8_t row = vdup_n_u8(0);
- for (int y = 0; y < block_height; y += 2) {
+ uint16x8_t row_shifted;
+ int y = 0;
+ do {
row = Load4<0>(src, row);
row = Load4<1>(src + stride, row);
if (y < (max_luma_height - 1)) {
src += stride << 1;
}
- const uint16x8_t row_shifted = vshll_n_u8(row, 3);
+ row_shifted = vshll_n_u8(row, 3);
+ running_sum = vpadalq_u16(running_sum, row_shifted);
+ vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
+ vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
+ y += 2;
+ } while (y < max_luma_height);
+
+ row_shifted =
+ vcombine_u16(vget_high_u16(row_shifted), vget_high_u16(row_shifted));
+ for (; y < block_height; y += 2) {
running_sum = vpadalq_u16(running_sum, row_shifted);
vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
@@ -463,12 +445,874 @@ void Init8bpp() {
} // namespace
} // namespace low_bitdepth
-void IntraPredCflInit_NEON() { low_bitdepth::Init8bpp(); }
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflSubsampler
+#ifndef __aarch64__
+uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
+ return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
+ vpadd_u16(vget_low_u16(b), vget_high_u16(b)));
+}
+#endif
+
+// This duplicates the last two 16-bit values in |row|.
+inline uint16x8_t LastRowSamples(const uint16x8_t row) {
+ const uint32x2_t a = vget_high_u32(vreinterpretq_u32_u16(row));
+ const uint32x4_t b = vdupq_lane_u32(a, 1);
+ return vreinterpretq_u16_u32(b);
+}
+
+// This duplicates the last unsigned 16-bit value in |row|.
+inline uint16x8_t LastRowResult(const uint16x8_t row) {
+ const uint16x4_t a = vget_high_u16(row);
+ const uint16x8_t b = vdupq_lane_u16(a, 0x3);
+ return b;
+}
+
+// This duplicates the last signed 16-bit value in |row|.
+inline int16x8_t LastRowResult(const int16x8_t row) {
+ const int16x4_t a = vget_high_s16(row);
+ const int16x8_t b = vdupq_lane_s16(a, 0x3);
+ return b;
+}
+
+// Takes in two sums of input row pairs, and completes the computation for two
+// output rows.
+inline uint16x8_t StoreLumaResults4_420(const uint16x8_t vertical_sum0,
+ const uint16x8_t vertical_sum1,
+ int16_t* luma_ptr) {
+ const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1);
+ const uint16x8_t result_shifted = vshlq_n_u16(result, 1);
+ vst1_s16(luma_ptr, vreinterpret_s16_u16(vget_low_u16(result_shifted)));
+ vst1_s16(luma_ptr + kCflLumaBufferStride,
+ vreinterpret_s16_u16(vget_high_u16(result_shifted)));
+ return result_shifted;
+}
+
+// Takes two halves of a vertically added pair of rows and completes the
+// computation for one output row.
+inline uint16x8_t StoreLumaResults8_420(const uint16x8_t vertical_sum0,
+ const uint16x8_t vertical_sum1,
+ int16_t* luma_ptr) {
+ const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1);
+ const uint16x8_t result_shifted = vshlq_n_u16(result, 1);
+ vst1q_s16(luma_ptr, vreinterpretq_s16_u16(result_shifted));
+ return result_shifted;
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_4xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 4, "");
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ uint16x4_t sum = vdup_n_u16(0);
+ uint16x4_t samples[2];
+ int y = visible_height;
+
+ do {
+ samples[0] = vld1_u16(src);
+ samples[1] = vld1_u16(src + src_stride);
+ src += src_stride << 1;
+ sum = vadd_u16(sum, samples[0]);
+ sum = vadd_u16(sum, samples[1]);
+ y -= 2;
+ } while (y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ samples[1] = vshl_n_u16(samples[1], 1);
+ do {
+ sum = vadd_u16(sum, samples[1]);
+ y += 2;
+ } while (y < block_height);
+ }
+
+ // Here the left shift by 3 (to increase precision) is nullified in right
+ // shift ((log2 of width 4) + 1).
+ const uint32_t average_sum =
+ RightShiftWithRounding(SumVector(vpaddl_u16(sum)), block_height_log2 - 1);
+ const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum));
+
+ const auto* ssrc = static_cast<const int16_t*>(source);
+ int16x4_t ssample;
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ ssample = vld1_s16(ssrc);
+ ssample = vshl_n_s16(ssample, 3);
+ vst1_s16(luma_ptr, vsub_s16(ssample, averages));
+ ssrc += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ vst1_s16(luma_ptr, vsub_s16(ssample, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_4xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_cast<void>(max_luma_width);
+ static_cast<void>(max_luma_height);
+ static_assert(block_height_log2 <= 4, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+
+ if (block_height <= max_luma_height) {
+ CflSubsampler444_4xH_NEON<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_4xH_NEON<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ uint32x4_t sum = vdupq_n_u32(0);
+ uint16x8_t samples;
+ int y = visible_height;
+
+ do {
+ samples = vld1q_u16(src);
+ src += src_stride;
+ sum = vpadalq_u16(sum, samples);
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ do {
+ sum = vpadalq_u16(sum, samples);
+ } while (++y < block_height);
+ }
+
+ // Here the left shift by 3 (to increase precision) is nullified in right
+ // shift (log2 of width 8).
+ const uint32_t average_sum =
+ RightShiftWithRounding(SumVector(sum), block_height_log2);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+ const auto* ssrc = static_cast<const int16_t*>(source);
+ int16x8_t ssample;
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ ssample = vld1q_s16(ssrc);
+ ssample = vshlq_n_s16(ssample, 3);
+ vst1q_s16(luma_ptr, vsubq_s16(ssample, averages));
+ ssrc += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ vst1q_s16(luma_ptr, vsubq_s16(ssample, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_cast<void>(max_luma_width);
+ static_cast<void>(max_luma_height);
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 8;
+
+ const int horz_inside = block_width <= max_luma_width;
+ const int vert_inside = block_height <= max_luma_height;
+ if (horz_inside && vert_inside) {
+ CflSubsampler444_8xH_NEON<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_8xH_NEON<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, bool is_inside>
+void CflSubsampler444_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const int block_width = 1 << block_width_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ uint32x4_t sum = vdupq_n_u32(0);
+ uint16x8_t samples[4];
+ int y = visible_height;
+
+ do {
+ samples[0] = vld1q_u16(src);
+ samples[1] =
+ (max_luma_width >= 16) ? vld1q_u16(src + 8) : LastRowResult(samples[0]);
+ uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]);
+ if (block_width == 32) {
+ samples[2] = (max_luma_width >= 24) ? vld1q_u16(src + 16)
+ : LastRowResult(samples[1]);
+ samples[3] = (max_luma_width == 32) ? vld1q_u16(src + 24)
+ : LastRowResult(samples[2]);
+ inner_sum = vaddq_u16(samples[2], inner_sum);
+ inner_sum = vaddq_u16(samples[3], inner_sum);
+ }
+ sum = vpadalq_u16(sum, inner_sum);
+ src += src_stride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]);
+ if (block_width == 32) {
+ inner_sum = vaddq_u16(samples[2], inner_sum);
+ inner_sum = vaddq_u16(samples[3], inner_sum);
+ }
+ do {
+ sum = vpadalq_u16(sum, inner_sum);
+ } while (++y < block_height);
+ }
+
+ // Here the left shift by 3 (to increase precision) is subtracted in right
+ // shift factor (block_width_log2 + block_height_log2 - 3).
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(sum), block_width_log2 + block_height_log2 - 3);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+ const auto* ssrc = static_cast<const int16_t*>(source);
+ int16x8_t ssamples_ext = vdupq_n_s16(0);
+ int16x8_t ssamples[4];
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ int idx = 0;
+ for (int x = 0; x < block_width; x += 8) {
+ if (max_luma_width > x) {
+ ssamples[idx] = vld1q_s16(&ssrc[x]);
+ ssamples[idx] = vshlq_n_s16(ssamples[idx], 3);
+ ssamples_ext = ssamples[idx];
+ } else {
+ ssamples[idx] = LastRowResult(ssamples_ext);
+ }
+ vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages));
+ }
+ ssrc += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ int idx = 0;
+ for (int x = 0; x < block_width; x += 8) {
+ vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages));
+ }
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler444_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_width_log2 == 4 || block_width_log2 == 5,
+ "This function will only work for block_width 16 and 32.");
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+
+ const int block_height = 1 << block_height_log2;
+ const int vert_inside = block_height <= max_luma_height;
+ if (vert_inside) {
+ CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_4xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = luma_height;
+
+ uint32x4_t final_sum = vdupq_n_u32(0);
+ do {
+ const uint16x8_t samples_row0 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row1 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum01 = vaddq_u16(samples_row0, samples_row1);
+
+ const uint16x8_t samples_row2 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row3 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum23 = vaddq_u16(samples_row2, samples_row3);
+ uint16x8_t sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ const uint16x8_t samples_row4 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row5 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum45 = vaddq_u16(samples_row4, samples_row5);
+
+ const uint16x8_t samples_row6 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row7 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum67 = vaddq_u16(samples_row6, samples_row7);
+ sum =
+ vaddq_u16(sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ final_sum = vpadalq_u16(final_sum, sum);
+ y -= 4;
+ } while (y != 0);
+
+ const uint16x4_t final_fill =
+ vreinterpret_u16_s16(vld1_s16(luma_ptr - kCflLumaBufferStride));
+ const uint32x4_t final_fill_to_sum = vmovl_u16(final_fill);
+ for (y = luma_height; y < block_height; ++y) {
+ vst1_s16(luma_ptr, vreinterpret_s16_u16(final_fill));
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+ }
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(final_sum), block_height_log2 + 2 /*log2 of width 4*/);
+ const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum));
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const int16x4_t samples = vld1_s16(luma_ptr);
+ vst1_s16(luma_ptr, vsub_s16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = luma_height;
+
+ uint32x4_t final_sum = vdupq_n_u32(0);
+ do {
+ const uint16x8_t samples_row00 = vld1q_u16(src);
+ const uint16x8_t samples_row01 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row00);
+ src += src_stride;
+ const uint16x8_t samples_row10 = vld1q_u16(src);
+ const uint16x8_t samples_row11 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row10);
+ src += src_stride;
+ const uint16x8_t luma_sum00 = vaddq_u16(samples_row00, samples_row10);
+ const uint16x8_t luma_sum01 = vaddq_u16(samples_row01, samples_row11);
+ uint16x8_t sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
+ luma_ptr += kCflLumaBufferStride;
+
+ const uint16x8_t samples_row20 = vld1q_u16(src);
+ const uint16x8_t samples_row21 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row20);
+ src += src_stride;
+ const uint16x8_t samples_row30 = vld1q_u16(src);
+ const uint16x8_t samples_row31 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row30);
+ src += src_stride;
+ const uint16x8_t luma_sum10 = vaddq_u16(samples_row20, samples_row30);
+ const uint16x8_t luma_sum11 = vaddq_u16(samples_row21, samples_row31);
+ sum =
+ vaddq_u16(sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const uint16x8_t samples_row40 = vld1q_u16(src);
+ const uint16x8_t samples_row41 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row40);
+ src += src_stride;
+ const uint16x8_t samples_row50 = vld1q_u16(src);
+ const uint16x8_t samples_row51 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row50);
+ src += src_stride;
+ const uint16x8_t luma_sum20 = vaddq_u16(samples_row40, samples_row50);
+ const uint16x8_t luma_sum21 = vaddq_u16(samples_row41, samples_row51);
+ sum =
+ vaddq_u16(sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const uint16x8_t samples_row60 = vld1q_u16(src);
+ const uint16x8_t samples_row61 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row60);
+ src += src_stride;
+ const uint16x8_t samples_row70 = vld1q_u16(src);
+ const uint16x8_t samples_row71 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row70);
+ src += src_stride;
+ const uint16x8_t luma_sum30 = vaddq_u16(samples_row60, samples_row70);
+ const uint16x8_t luma_sum31 = vaddq_u16(samples_row61, samples_row71);
+ sum =
+ vaddq_u16(sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = vpadalq_u16(final_sum, sum);
+ y -= 4;
+ } while (y != 0);
+
+ // Duplicate the final row downward to the end after max_luma_height.
+ const uint16x8_t final_fill =
+ vreinterpretq_u16_s16(vld1q_s16(luma_ptr - kCflLumaBufferStride));
+ const uint32x4_t final_fill_to_sum =
+ vaddl_u16(vget_low_u16(final_fill), vget_high_u16(final_fill));
+
+ for (y = luma_height; y < block_height; ++y) {
+ vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill));
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+ }
+
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(final_sum), block_height_log2 + 3 /*log2 of width 8*/);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const int16x8_t samples = vld1q_s16(luma_ptr);
+ vst1q_s16(luma_ptr, vsubq_s16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_height_log2>
+void CflSubsampler420_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ if (max_luma_width == 8) {
+ CflSubsampler420Impl_8xH_NEON<block_height_log2, 8>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler420Impl_8xH_NEON<block_height_log2, 16>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ const int block_height = 1 << block_height_log2;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int16_t* luma_ptr = luma[0];
+ // Begin first y section, covering width up to 32.
+ int y = luma_height;
+
+ uint16x8_t final_fill0, final_fill1;
+ uint32x4_t final_sum = vdupq_n_u32(0);
+ do {
+ const uint16_t* src_next = src + src_stride;
+ const uint16x8_t samples_row00 = vld1q_u16(src);
+ const uint16x8_t samples_row01 = (max_luma_width >= 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row00);
+ const uint16x8_t samples_row02 = (max_luma_width >= 24)
+ ? vld1q_u16(src + 16)
+ : LastRowSamples(samples_row01);
+ const uint16x8_t samples_row03 = (max_luma_width == 32)
+ ? vld1q_u16(src + 24)
+ : LastRowSamples(samples_row02);
+ const uint16x8_t samples_row10 = vld1q_u16(src_next);
+ const uint16x8_t samples_row11 = (max_luma_width >= 16)
+ ? vld1q_u16(src_next + 8)
+ : LastRowSamples(samples_row10);
+ const uint16x8_t samples_row12 = (max_luma_width >= 24)
+ ? vld1q_u16(src_next + 16)
+ : LastRowSamples(samples_row11);
+ const uint16x8_t samples_row13 = (max_luma_width == 32)
+ ? vld1q_u16(src_next + 24)
+ : LastRowSamples(samples_row12);
+ const uint16x8_t luma_sum0 = vaddq_u16(samples_row00, samples_row10);
+ const uint16x8_t luma_sum1 = vaddq_u16(samples_row01, samples_row11);
+ const uint16x8_t luma_sum2 = vaddq_u16(samples_row02, samples_row12);
+ const uint16x8_t luma_sum3 = vaddq_u16(samples_row03, samples_row13);
+ final_fill0 = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
+ final_fill1 = StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
+ const uint16x8_t sum = vaddq_u16(final_fill0, final_fill1);
+
+ final_sum = vpadalq_u16(final_sum, sum);
+
+    // Because max_luma_width is at most 32 luma samples, a 32-wide chroma
+    // block has every column at x >= 16 duplicated from the last computed
+    // value; each duplicated lane is counted twice (the shift by 1) so the
+    // sum covers all 16 duplicated columns.
+ if (block_width_log2 == 5) {
+ const uint16x8_t wide_fill = LastRowResult(final_fill1);
+ final_sum = vpadalq_u16(final_sum, vshlq_n_u16(wide_fill, 1));
+ }
+ src += src_stride << 1;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ // Begin second y section.
+ y = luma_height;
+ if (y < block_height) {
+ uint32x4_t wide_fill;
+ if (block_width_log2 == 5) {
+      // There are 16 16-bit fill values per row beyond x=16. Only 4 lanes are
+      // widened here, so shifting by 2 counts each lane four times, matching
+      // the (a + a) << 1 pairwise accumulation used in the loop above.
+ wide_fill = vshll_n_u16(vget_low_u16(LastRowResult(final_fill1)), 2);
+ }
+ const uint16x8_t final_inner_sum = vaddq_u16(final_fill0, final_fill1);
+ const uint32x4_t final_fill_to_sum = vaddl_u16(
+ vget_low_u16(final_inner_sum), vget_high_u16(final_inner_sum));
+
+ do {
+ vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill0));
+ vst1q_s16(luma_ptr + 8, vreinterpretq_s16_u16(final_fill1));
+ if (block_width_log2 == 5) {
+ final_sum = vaddq_u32(final_sum, wide_fill);
+ }
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+ } while (++y < block_height);
+ } // End second y section.
+
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(final_sum), block_width_log2 + block_height_log2);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const int16x8_t samples0 = vld1q_s16(luma_ptr);
+ vst1q_s16(luma_ptr, vsubq_s16(samples0, averages));
+ const int16x8_t samples1 = vld1q_s16(luma_ptr + 8);
+ const int16x8_t final_row_result = vsubq_s16(samples1, averages);
+ vst1q_s16(luma_ptr + 8, final_row_result);
+
+ if (block_width_log2 == 5) {
+ const int16x8_t wide_fill = LastRowResult(final_row_result);
+ vst1q_s16(luma_ptr + 16, wide_fill);
+ vst1q_s16(luma_ptr + 24, wide_fill);
+ }
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+//------------------------------------------------------------------------------
+// Choose subsampler based on max_luma_width
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler420_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ switch (max_luma_width) {
+ case 8:
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 8>(
+ luma, max_luma_height, source, stride);
+ return;
+ case 16:
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 16>(
+ luma, max_luma_height, source, stride);
+ return;
+ case 24:
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 24>(
+ luma, max_luma_height, source, stride);
+ return;
+ default:
+ assert(max_luma_width == 32);
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 32>(
+ luma, max_luma_height, source, stride);
+ return;
+ }
+}
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor
+
+// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
+// |alpha| can be -16 to 16 (inclusive).
+// Clip |dc + ((alpha * luma) >> 6)| to the range [0, (1 << bitdepth) - 1].
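+// For example (illustrative values): with bitdepth 10, alpha = -4 and
+// luma = 96, (alpha * luma) >> 6 == -6, so the result is dc - 6 clamped to
+// [0, 1023].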
+inline uint16x8_t Combine8(const int16x8_t luma, const int16x8_t alpha_abs,
+ const int16x8_t alpha_signed, const int16x8_t dc,
+ const uint16x8_t max_value) {
+ const int16x8_t luma_abs = vabsq_s16(luma);
+ const int16x8_t luma_alpha_sign =
+ vshrq_n_s16(veorq_s16(luma, alpha_signed), 15);
+  // (alpha * luma) >> 6: |alpha_abs| was pre-scaled by 1 << 9, so the
+  // saturating rounding doubling multiply-high (effectively >> 15) yields the
+  // required >> 6.
+ const int16x8_t la_abs = vqrdmulhq_s16(luma_abs, alpha_abs);
+ // Convert back to signed values.
+ const int16x8_t la =
+ vsubq_s16(veorq_s16(la_abs, luma_alpha_sign), luma_alpha_sign);
+ const int16x8_t result = vaddq_s16(la, dc);
+ const int16x8_t zero = vdupq_n_s16(0);
+ // Clip.
+ return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(result, zero)), max_value);
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor4xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
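+  // |dst[0]| already holds the DC prediction for the block, written by the DC
+  // predictor before this function runs.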
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; y += 2) {
+ const int16x4_t luma_row0 = vld1_s16(luma[y]);
+ const int16x4_t luma_row1 = vld1_s16(luma[y + 1]);
+ const int16x8_t combined_luma = vcombine_s16(luma_row0, luma_row1);
+ const uint16x8_t sum =
+ Combine8(combined_luma, alpha_abs, alpha_signed, dc, max_value);
+ vst1_u16(dst, vget_low_u16(sum));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(sum));
+ dst += dst_stride;
+ }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor8xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row = vld1q_s16(luma[y]);
+ const uint16x8_t sum =
+ Combine8(luma_row, alpha_abs, alpha_signed, dc, max_value);
+ vst1q_u16(dst, sum);
+ dst += dst_stride;
+ }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor16xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+ const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+ const uint16x8_t sum_0 =
+ Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_1 =
+ Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value);
+ vst1q_u16(dst, sum_0);
+ vst1q_u16(dst + 8, sum_1);
+ dst += dst_stride;
+ }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor32xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+ const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+ const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16);
+ const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24);
+ const uint16x8_t sum_0 =
+ Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_1 =
+ Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_2 =
+ Combine8(luma_row_2, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_3 =
+ Combine8(luma_row_3, alpha_abs, alpha_signed, dc, max_value);
+ vst1q_u16(dst, sum_0);
+ vst1q_u16(dst + 8, sum_1);
+ vst1q_u16(dst + 16, sum_2);
+ vst1q_u16(dst + 24, sum_3);
+ dst += dst_stride;
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler420_4xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler420_4xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler420_4xH_NEON<4>;
+
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<4>;
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<5>;
+
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 2>;
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 3>;
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 4>;
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 5>;
+
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<5, 3>;
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<5, 4>;
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<5, 5>;
+
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler444_4xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler444_4xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler444_4xH_NEON<4>;
+
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<4>;
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<5>;
+
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 2>;
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 3>;
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 4>;
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 5>;
+
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<5, 3>;
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<5, 4>;
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<5, 5>;
+
+ dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>;
+
+ dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>;
+
+ dsp->cfl_intra_predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor16xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor16xN_NEON<32>;
+ dsp->cfl_intra_predictors[kTransformSize32x8] = CflIntraPredictor32xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor32xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor32xN_NEON<32>;
+ // Max Cfl predictor size is 32x32.
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredCflInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/intrapred_cfl_neon.h b/src/dsp/arm/intrapred_cfl_neon.h
new file mode 100644
index 0000000..b4f983a
--- /dev/null
+++ b/src/dsp/arm/intrapred_cfl_neon.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers, see the
+// defines below for specifics. These functions are not thread-safe.
+void IntraPredCflInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+// 4x4
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// -----------------------------------------------------------------------------
+// 10bpp
+
+// 4x4
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc
index 805ba81..3f5edbd 100644
--- a/src/dsp/arm/intrapred_directional_neon.cc
+++ b/src/dsp/arm/intrapred_directional_neon.cc
@@ -12,18 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_directional.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
#include <arm_neon.h>
-#include <algorithm> // std::min
+#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
-#include <cstring> // memset
+#include <cstring>
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
@@ -35,14 +35,14 @@ namespace dsp {
namespace low_bitdepth {
namespace {
-// Blend two values based on a 32 bit weight.
+// Blend two values based on weights that sum to 32.
inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
const uint8x8_t a_weight,
const uint8x8_t b_weight) {
const uint16x8_t a_product = vmull_u8(a, a_weight);
const uint16x8_t b_product = vmull_u8(b, b_weight);
- return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5);
+ return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5 /*log2(32)*/);
}
// For vertical operations the weights are one constant value.
@@ -112,7 +112,7 @@ inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
// 4 wide subsamples the output. 8 wide subsamples the input.
if (width == 4) {
const uint8x8_t left_values = vld1_u8(top + top_base_x);
- const uint8x8_t right_values = RightShift<8>(left_values);
+ const uint8x8_t right_values = RightShiftVector<8>(left_values);
const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
// If |upsampled| is true then extract every other value for output.
@@ -910,12 +910,590 @@ void Init8bpp() {
} // namespace
} // namespace low_bitdepth
-void IntraPredDirectionalInit_NEON() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Blend two values based on weights that sum to 32.
+inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
+ const int a_weight, const int b_weight) {
+ const uint16x4_t a_product = vmul_n_u16(a, a_weight);
+ const uint16x4_t sum = vmla_n_u16(a_product, b, b_weight);
+
+ return vrshr_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Blend two values based on weights that sum to 32.
+inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
+ const uint16_t a_weight,
+ const uint16_t b_weight) {
+ const uint16x8_t a_product = vmulq_n_u16(a, a_weight);
+ const uint16x8_t sum = vmlaq_n_u16(a_product, b, b_weight);
+
+ return vrshrq_n_u16(sum, 5 /*log2(32)*/);
+}
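+
+// In both overloads the weights are the 5-bit fractional pixel position and
+// its complement, e.g. b_weight == 12 yields (20 * a + 12 * b + 16) >> 5.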
+
+// Each element of |dest| contains values associated with one weight value.
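+// When |upsampled| is true the blend pairs are interleaved in the edge
+// buffer, so vld2 deinterleaves them; otherwise val[1] is simply val[0]
+// shifted by one sample.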
+inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* const source,
+ const bool upsampled) {
+ if (upsampled) {
+ *dest = vld2_u16(source);
+ } else {
+ dest->val[0] = vld1_u16(source);
+ dest->val[1] = vld1_u16(source + 1);
+ }
+}
+
+// Each element of |dest| contains values associated with one weight value.
+inline void LoadEdgeVals(uint16x8x2_t* dest, const uint16_t* const source,
+ const bool upsampled) {
+ if (upsampled) {
+ *dest = vld2q_u16(source);
+ } else {
+ dest->val[0] = vld1q_u16(source);
+ dest->val[1] = vld1q_u16(source + 1);
+ }
+}
+
+template <bool upsampled>
+inline void DirectionalZone1_4xH(uint16_t* dst, const ptrdiff_t stride,
+ const int height, const uint16_t* const top,
+ const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ const int max_base_x = (4 + height - 1) << upsample_shift;
+ const int16x4_t max_base = vdup_n_s16(max_base_x);
+ const uint16x4_t final_top_val = vdup_n_u16(top[max_base_x]);
+ const int16x4_t index_offset = {0, 1, 2, 3};
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
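+  // e.g. with height == 16, no upsampling, and a hypothetical xstep of 1024:
+  // max_base_x == 19 and xstep_units == 16, so min_corner_only_y == 1 and
+  // every row after the first is filled with top[19].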
+
+ int top_x = xstep;
+ int y = 0;
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ const int top_base_x = top_x >> index_scale_bits;
+
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ // Use signed values to compare |top_base_x| to |max_base_x|.
+ const int16x4_t base_x = vadd_s16(vdup_n_s16(top_base_x), index_offset);
+ const uint16x4_t max_base_mask = vclt_s16(base_x, max_base);
+
+ uint16x4x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x4_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+    // Replace any lane whose index reaches |max_base_x| with the final top
+    // value.
+ const uint16x4_t masked_result =
+ vbsl_u16(max_base_mask, combined, final_top_val);
+
+ vst1_u16(dst, masked_result);
+ }
+ for (; y < height; ++y) {
+ Memset(dst, top[max_base_x], 4 /* width */);
+ dst += stride;
+ }
+}
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+template <bool upsampled>
+inline void DirectionalZone1_WxH(uint16_t* dst, const ptrdiff_t stride,
+ const int width, const int height,
+ const uint16_t* const top, const int xstep) {
+ assert(width % 8 == 0);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ const int max_base_index = (width + height - 1) << upsample_shift;
+ const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
+ const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
+ const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
+
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+ const int16x8_t block_step = vdupq_n_s16(base_step8);
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
+
+ int top_x = xstep;
+ int y = 0;
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ // Use signed values to compare |top_base_x| to |max_base_x|.
+ int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
+
+ int x = 0;
+ do {
+ const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
+
+ uint16x8x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x8_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ const uint16x8_t masked_result =
+ vbslq_u16(max_base_mask, combined, final_top_val);
+ vst1q_u16(dst + x, masked_result);
+
+ base_x = vaddq_s16(base_x, block_step);
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+ for (int i = y; i < height; ++i) {
+ Memset(dst, top[max_base_index], width);
+ dst += stride;
+ }
+}
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+inline void DirectionalZone1_Large(uint16_t* dst, const ptrdiff_t stride,
+ const int width, const int height,
+ const uint16_t* const top, const int xstep,
+ const bool upsampled) {
+ assert(width % 8 == 0);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ const int max_base_index = (width + height - 1) << upsample_shift;
+ const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
+ const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
+ const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
+
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+ const int16x8_t block_step = vdupq_n_s16(base_step8);
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
+
+ // Rows up to this y-value can be computed without checking for bounds.
+ const int max_no_corner_y = std::min(
+ ((max_base_index - (base_step * width)) << index_scale_bits) / xstep,
+ height);
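+  // Derivation: row y reads top indices up to roughly
+  // (((y + 1) * xstep) >> index_scale_bits) + base_step * width; keeping this
+  // at or below |max_base_index| gives the bound above.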
+ // No need to check for exceeding |max_base_x| in the first loop.
+ int y = 0;
+ int top_x = xstep;
+ for (; y < max_no_corner_y; ++y, dst += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ int x = 0;
+ do {
+ uint16x8x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x8_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ vst1q_u16(dst + x, combined);
+
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ // Use signed values to compare |top_base_x| to |max_base_x|.
+ int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
+
+ int x = 0;
+ const int min_corner_only_x =
+ std::min(width, ((max_base_index - top_base_x) >> upsample_shift) + 7) &
+ ~7;
+ for (; x < min_corner_only_x; x += 8, top_base_x += base_step8,
+ base_x = vaddq_s16(base_x, block_step)) {
+ const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
+
+ uint16x8x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x8_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ const uint16x8_t masked_result =
+ vbslq_u16(max_base_mask, combined, final_top_val);
+ vst1q_u16(dst + x, masked_result);
+ }
+ // Corner-only section of the row.
+ Memset(dst + x, top[max_base_index], width - x);
+ }
+ for (; y < height; ++y) {
+ Memset(dst, top[max_base_index], width);
+ dst += stride;
+ }
+}
+
+void DirectionalIntraPredictorZone1_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const int width, const int height,
+ const int xstep,
+ const bool upsampled_top) {
+ const uint16_t* const top = static_cast<const uint16_t*>(top_row);
+ uint16_t* dst = static_cast<uint16_t*>(dest);
+ stride /= sizeof(top[0]);
+
+ assert(xstep > 0);
+
+ if (xstep == 64) {
+ assert(!upsampled_top);
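+    // xstep == 64 advances exactly one whole pixel per row with a zero
+    // fractional shift, so each row is a plain copy of the top row shifted by
+    // one sample.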
+ const uint16_t* top_ptr = top + 1;
+ const int width_bytes = width * sizeof(top[0]);
+ int y = height;
+ do {
+ memcpy(dst, top_ptr, width_bytes);
+ memcpy(dst + stride, top_ptr + 1, width_bytes);
+ memcpy(dst + 2 * stride, top_ptr + 2, width_bytes);
+ memcpy(dst + 3 * stride, top_ptr + 3, width_bytes);
+ dst += 4 * stride;
+ top_ptr += 4;
+ y -= 4;
+ } while (y != 0);
+ } else {
+ if (width == 4) {
+ if (upsampled_top) {
+ DirectionalZone1_4xH<true>(dst, stride, height, top, xstep);
+ } else {
+ DirectionalZone1_4xH<false>(dst, stride, height, top, xstep);
+ }
+ } else if (width >= 32) {
+ if (upsampled_top) {
+ DirectionalZone1_Large(dst, stride, width, height, top, xstep, true);
+ } else {
+ DirectionalZone1_Large(dst, stride, width, height, top, xstep, false);
+ }
+ } else if (upsampled_top) {
+ DirectionalZone1_WxH<true>(dst, stride, width, height, top, xstep);
+ } else {
+ DirectionalZone1_WxH<false>(dst, stride, width, height, top, xstep);
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Zone 3
+// This can be considered "the transpose of Zone 1." In Zone 1, the fractional
+// step applies when moving vertically in the destination block, connected to
+// the change in |y|, whereas in this mode, the step applies when moving
+// horizontally, connected to the change in |x|. This makes vectorization very
+// complicated in row-order, because a given vector may need source pixels that
+// span 16 or 32 pixels in steep angles, requiring multiple expensive table
+// lookups and checked loads. Rather than work in row order, it is simpler to
+// compute |dest| in column order, and then store the transposed results.
+
+// Compute 4x4 sub-blocks.
+// Example of computed sub-blocks of a 4x8 block before and after transpose:
+// 00 10 20 30 00 01 02 03
+// 01 11 21 31 10 11 12 13
+// 02 12 22 32 20 21 22 23
+// 03 13 23 33 30 31 32 33
+// ----------- --> -----------
+// 40 50 60 70 40 41 42 43
+// 41 51 61 71 50 51 52 53
+// 42 52 62 72 60 61 62 63
+// 43 53 63 73 70 71 72 73
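+//
+// For each destination column, |left_y| advances by |ystep|; its integer part
+// selects the base sample in |left| and its fraction supplies the 32-part
+// blend weights. Consecutive left samples fill the column, which is then
+// transposed for storage.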
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* dst, const ptrdiff_t stride,
+ const uint16_t* const left, const int ystep,
+ const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ // Compute one column at a time, then transpose for storage.
+ uint16x4_t result[4];
+
+ int left_y = base_left_y + ystep;
+ int left_offset = left_y >> index_scale_bits;
+ int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ int shift_1 = 32 - shift_0;
+ uint16x4x2_t sampled_left_col;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ Transpose4x4(result);
+ Store4(dst, result[0]);
+ dst += stride;
+ Store4(dst, result[1]);
+ dst += stride;
+ Store4(dst, result[2]);
+ dst += stride;
+ Store4(dst, result[3]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4xH(uint8_t* dest, const ptrdiff_t stride,
+ const int height, const uint16_t* const left,
+ const int ystep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ int y = 0;
+ do {
+ DirectionalZone3_4x4<upsampled>(dest, stride, left + (y << upsample_shift),
+ ystep);
+ dest += 4 * stride;
+ y += 4;
+ } while (y < height);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_Wx4(uint8_t* dest, const ptrdiff_t stride,
+ const int width, const uint16_t* const left,
+ const int ystep) {
+ int x = 0;
+ int base_left_y = 0;
+ do {
+ // TODO(petersonab): Establish 8x4 transpose to reserve this function for
+ // 8x4 and 16x4.
+ DirectionalZone3_4x4<upsampled>(dest + 2 * x, stride, left, ystep,
+ base_left_y);
+ base_left_y += 4 * ystep;
+ x += 4;
+ } while (x < width);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_8x8(uint8_t* dest, const ptrdiff_t stride,
+ const uint16_t* const left, const int ystep,
+ const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ // Compute one column at a time, then transpose for storage.
+ uint16x8_t result[8];
+
+ int left_y = base_left_y + ystep;
+ uint16x8x2_t sampled_left_col;
+ int left_offset = left_y >> index_scale_bits;
+ int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ int shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[4] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[5] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[6] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[7] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ Transpose8x8(result);
+ Store8(dest, result[0]);
+ dest += stride;
+ Store8(dest, result[1]);
+ dest += stride;
+ Store8(dest, result[2]);
+ dest += stride;
+ Store8(dest, result[3]);
+ dest += stride;
+ Store8(dest, result[4]);
+ dest += stride;
+ Store8(dest, result[5]);
+ dest += stride;
+ Store8(dest, result[6]);
+ dest += stride;
+ Store8(dest, result[7]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride,
+ const int width, const int height,
+ const uint16_t* const left, const int ystep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ // Zone3 never runs out of left_column values.
+ assert((width + height - 1) << upsample_shift > // max_base_y
+ ((ystep * width) >> (6 - upsample_shift)) +
+ (/* base_step */ 1 << upsample_shift) *
+ (height - 1)); // left_base_y
+ int y = 0;
+ do {
+ int x = 0;
+ uint8_t* dst_x = dest + y * stride;
+ do {
+ const int base_left_y = ystep * x;
+ DirectionalZone3_8x8<upsampled>(
+ dst_x, stride, left + (y << upsample_shift), ystep, base_left_y);
+ dst_x += 8 * sizeof(uint16_t);
+ x += 8;
+ } while (x < width);
+ y += 8;
+ } while (y < height);
+}
+
+void DirectionalIntraPredictorZone3_NEON(void* const dest,
+ const ptrdiff_t stride,
+ const void* const left_column,
+ const int width, const int height,
+ const int ystep,
+ const bool upsampled_left) {
+ const uint16_t* const left = static_cast<const uint16_t*>(left_column);
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ if (ystep == 64) {
+ assert(!upsampled_left);
+ const int width_bytes = width * sizeof(left[0]);
+    const uint16_t* left_ptr = left + 1;
+    int y = height;
+    do {
+ memcpy(dst, left_ptr, width_bytes);
+ memcpy(dst + stride, left_ptr + 1, width_bytes);
+ memcpy(dst + 2 * stride, left_ptr + 2, width_bytes);
+ memcpy(dst + 3 * stride, left_ptr + 3, width_bytes);
+ dst += 4 * stride;
+ left_ptr += 4;
+ y -= 4;
+ } while (y != 0);
+ return;
+ }
+ if (width == 4) {
+ if (upsampled_left) {
+ DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
+ } else {
+ DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
+ }
+ } else if (height == 4) {
+ if (upsampled_left) {
+ DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
+ } else {
+ DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
+ }
+ } else {
+ if (upsampled_left) {
+ // |upsampled_left| can only be true if |width| + |height| <= 16,
+ // therefore this is 8x8.
+ DirectionalZone3_8x8<true>(dst, stride, left, ystep);
+ } else {
+ DirectionalZone3_WxH<false>(dst, stride, width, height, left, ystep);
+ }
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
+ dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredDirectionalInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/intrapred_directional_neon.h b/src/dsp/arm/intrapred_directional_neon.h
new file mode 100644
index 0000000..f7d6235
--- /dev/null
+++ b/src/dsp/arm/intrapred_directional_neon.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*, see the defines below for
+// specifics. These functions are not thread-safe.
+void IntraPredDirectionalInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
+#endif
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
diff --git a/src/dsp/arm/intrapred_filter_intra_neon.cc b/src/dsp/arm/intrapred_filter_neon.cc
index 411708e..bd9f61d 100644
--- a/src/dsp/arm/intrapred_filter_intra_neon.cc
+++ b/src/dsp/arm/intrapred_filter_neon.cc
@@ -1,4 +1,4 @@
-// Copyright 2019 The libgav1 Authors
+// Copyright 2021 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_filter.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
@@ -160,16 +160,16 @@ void Init8bpp() {
} // namespace
} // namespace low_bitdepth
-void IntraPredFilterIntraInit_NEON() { low_bitdepth::Init8bpp(); }
+void IntraPredFilterInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
-void IntraPredFilterIntraInit_NEON() {}
+void IntraPredFilterInit_NEON() {}
} // namespace dsp
} // namespace libgav1
diff --git a/src/dsp/arm/intrapred_filter_neon.h b/src/dsp/arm/intrapred_filter_neon.h
new file mode 100644
index 0000000..283c1b1
--- /dev/null
+++ b/src/dsp/arm/intrapred_filter_neon.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::filter_intra_predictor, see the defines below for specifics.
+// These functions are not thread-safe.
+void IntraPredFilterInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
diff --git a/src/dsp/arm/intrapred_neon.cc b/src/dsp/arm/intrapred_neon.cc
index c967d82..c143648 100644
--- a/src/dsp/arm/intrapred_neon.cc
+++ b/src/dsp/arm/intrapred_neon.cc
@@ -26,6 +26,7 @@
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
@@ -964,6 +965,200 @@ struct DcDefs {
using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
};
+// IntraPredFuncs_NEON::Horizontal -- duplicate left column across all rows
+
+template <int block_height>
+void Horizontal4xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16x4_t row = vld1_dup_u16(left + y);
+ vst1_u16(dst16, row);
+ dst += stride;
+ } while (++y < block_height);
+}
+
+template <int block_height>
+void Horizontal8xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16x8_t row = vld1q_dup_u16(left + y);
+ vst1q_u16(dst16, row);
+ dst += stride;
+ } while (++y < block_height);
+}
+
+template <int block_height>
+void Horizontal16xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ const uint16x8_t row0 = vld1q_dup_u16(left + y);
+ const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row0);
+ vst1q_u16(dst16 + 8, row0);
+ dst += stride;
+ dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row1);
+ vst1q_u16(dst16 + 8, row1);
+ dst += stride;
+ y += 2;
+ } while (y < block_height);
+}
+
+template <int block_height>
+void Horizontal32xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ const uint16x8_t row0 = vld1q_dup_u16(left + y);
+ const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row0);
+ vst1q_u16(dst16 + 8, row0);
+ vst1q_u16(dst16 + 16, row0);
+ vst1q_u16(dst16 + 24, row0);
+ dst += stride;
+ dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row1);
+ vst1q_u16(dst16 + 8, row1);
+ vst1q_u16(dst16 + 16, row1);
+ vst1q_u16(dst16 + 24, row1);
+ dst += stride;
+ y += 2;
+ } while (y < block_height);
+}
+
+// IntraPredFuncs_NEON::Vertical -- copy top row to all rows
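+//
+// These copy raw bytes rather than 16-bit lanes, so at 10bpp an N-pixel-wide
+// block copies 2 * N bytes per row; only the row count varies with the
+// template parameter.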
+
+template <int block_height>
+void Vertical4xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x8_t row = vld1_u8(top);
+ int y = block_height;
+ do {
+ vst1_u8(dst, row);
+ dst += stride;
+ } while (--y != 0);
+}
+
+template <int block_height>
+void Vertical8xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row = vld1q_u8(top);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row);
+ dst += stride;
+ } while (--y != 0);
+}
+
+template <int block_height>
+void Vertical16xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row0 = vld1q_u8(top);
+ const uint8x16_t row1 = vld1q_u8(top + 16);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ dst += stride;
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ dst += stride;
+ y -= 2;
+ } while (y != 0);
+}
+
+template <int block_height>
+void Vertical32xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row0 = vld1q_u8(top);
+ const uint8x16_t row1 = vld1q_u8(top + 16);
+ const uint8x16_t row2 = vld1q_u8(top + 32);
+ const uint8x16_t row3 = vld1q_u8(top + 48);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ dst += stride;
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ dst += stride;
+ y -= 2;
+ } while (y != 0);
+}
+
+template <int block_height>
+void Vertical64xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row0 = vld1q_u8(top);
+ const uint8x16_t row1 = vld1q_u8(top + 16);
+ const uint8x16_t row2 = vld1q_u8(top + 32);
+ const uint8x16_t row3 = vld1q_u8(top + 48);
+ const uint8x16_t row4 = vld1q_u8(top + 64);
+ const uint8x16_t row5 = vld1q_u8(top + 80);
+ const uint8x16_t row6 = vld1q_u8(top + 96);
+ const uint8x16_t row7 = vld1q_u8(top + 112);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ vst1q_u8(dst + 64, row4);
+ vst1q_u8(dst + 80, row5);
+ vst1q_u8(dst + 96, row6);
+ vst1q_u8(dst + 112, row7);
+ dst += stride;
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ vst1q_u8(dst + 64, row4);
+ vst1q_u8(dst + 80, row5);
+ vst1q_u8(dst + 96, row6);
+ vst1q_u8(dst + 112, row7);
+ dst += stride;
+ y -= 2;
+ } while (y != 0);
+}
+
void Init10bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
assert(dsp != nullptr);
@@ -973,6 +1168,8 @@ void Init10bpp() {
DcDefs::_4x4::DcLeft;
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
DcDefs::_4x4::Dc;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+ Vertical4xH_NEON<4>;
// 4x8
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
@@ -981,6 +1178,10 @@ void Init10bpp() {
DcDefs::_4x8::DcLeft;
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
DcDefs::_4x8::Dc;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ Horizontal4xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+ Vertical4xH_NEON<8>;
// 4x16
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
@@ -989,6 +1190,10 @@ void Init10bpp() {
DcDefs::_4x16::DcLeft;
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
DcDefs::_4x16::Dc;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ Horizontal4xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+ Vertical4xH_NEON<16>;
// 8x4
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
@@ -997,6 +1202,8 @@ void Init10bpp() {
DcDefs::_8x4::DcLeft;
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
DcDefs::_8x4::Dc;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+ Vertical8xH_NEON<4>;
// 8x8
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
@@ -1005,6 +1212,10 @@ void Init10bpp() {
DcDefs::_8x8::DcLeft;
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
DcDefs::_8x8::Dc;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ Horizontal8xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+ Vertical8xH_NEON<8>;
// 8x16
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
@@ -1013,6 +1224,8 @@ void Init10bpp() {
DcDefs::_8x16::DcLeft;
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
DcDefs::_8x16::Dc;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+ Vertical8xH_NEON<16>;
// 8x32
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
@@ -1021,6 +1234,10 @@ void Init10bpp() {
DcDefs::_8x32::DcLeft;
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
DcDefs::_8x32::Dc;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ Horizontal8xH_NEON<32>;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+ Vertical8xH_NEON<32>;
// 16x4
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
@@ -1029,6 +1246,8 @@ void Init10bpp() {
DcDefs::_16x4::DcLeft;
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
DcDefs::_16x4::Dc;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+ Vertical16xH_NEON<4>;
// 16x8
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
@@ -1037,6 +1256,10 @@ void Init10bpp() {
DcDefs::_16x8::DcLeft;
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
DcDefs::_16x8::Dc;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ Horizontal16xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+ Vertical16xH_NEON<8>;
// 16x16
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
@@ -1045,6 +1268,8 @@ void Init10bpp() {
DcDefs::_16x16::DcLeft;
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
DcDefs::_16x16::Dc;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+ Vertical16xH_NEON<16>;
// 16x32
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
@@ -1053,6 +1278,8 @@ void Init10bpp() {
DcDefs::_16x32::DcLeft;
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
DcDefs::_16x32::Dc;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+ Vertical16xH_NEON<32>;
// 16x64
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
@@ -1061,6 +1288,8 @@ void Init10bpp() {
DcDefs::_16x64::DcLeft;
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
DcDefs::_16x64::Dc;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+ Vertical16xH_NEON<64>;
// 32x8
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
@@ -1069,6 +1298,8 @@ void Init10bpp() {
DcDefs::_32x8::DcLeft;
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
DcDefs::_32x8::Dc;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+ Vertical32xH_NEON<8>;
// 32x16
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
@@ -1077,6 +1308,8 @@ void Init10bpp() {
DcDefs::_32x16::DcLeft;
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
DcDefs::_32x16::Dc;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+ Vertical32xH_NEON<16>;
// 32x32
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
@@ -1085,6 +1318,8 @@ void Init10bpp() {
DcDefs::_32x32::DcLeft;
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
DcDefs::_32x32::Dc;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+ Vertical32xH_NEON<32>;
// 32x64
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
@@ -1093,6 +1328,10 @@ void Init10bpp() {
DcDefs::_32x64::DcLeft;
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
DcDefs::_32x64::Dc;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ Horizontal32xH_NEON<64>;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+ Vertical32xH_NEON<64>;
// 64x16
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
@@ -1101,6 +1340,8 @@ void Init10bpp() {
DcDefs::_64x16::DcLeft;
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
DcDefs::_64x16::Dc;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+ Vertical64xH_NEON<16>;
// 64x32
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
@@ -1109,6 +1350,8 @@ void Init10bpp() {
DcDefs::_64x32::DcLeft;
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
DcDefs::_64x32::Dc;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+ Vertical64xH_NEON<32>;
// 64x64
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
@@ -1117,6 +1360,8 @@ void Init10bpp() {
DcDefs::_64x64::DcLeft;
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
DcDefs::_64x64::Dc;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+ Vertical64xH_NEON<64>;
}
} // namespace
@@ -1133,7 +1378,7 @@ void IntraPredInit_NEON() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
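The Init10bpp() hunks above only fill in the shared Dsp function table; callers then dispatch through that table rather than calling the NEON routines directly. A hedged sketch of the call pattern follows -- GetDspTable() and the intra_predictors layout are taken from dsp.h as I read it, and the local variable names are purely illustrative.

// Sketch of dispatching through the table populated by Init10bpp().
const Dsp* const dsp = GetDspTable(/*bitdepth=*/10);
assert(dsp != nullptr);
auto* const pred =
    dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical];
if (pred != nullptr) {
  // On a NEON build after this patch, this runs Vertical8xH_NEON<8>.
  pred(dest, stride, top_row, left_column);
}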
diff --git a/src/dsp/arm/intrapred_neon.h b/src/dsp/arm/intrapred_neon.h
index 16f858c..b27f29f 100644
--- a/src/dsp/arm/intrapred_neon.h
+++ b/src/dsp/arm/intrapred_neon.h
@@ -23,396 +23,282 @@
namespace libgav1 {
namespace dsp {
-// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*,
-// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and
-// Dsp::filter_intra_predictor, see the defines below for specifics. These
-// functions are not thread-safe.
-void IntraPredCflInit_NEON();
-void IntraPredDirectionalInit_NEON();
-void IntraPredFilterIntraInit_NEON();
+// Initializes Dsp::intra_predictors.
+// See the defines below for specifics. These functions are not thread-safe.
void IntraPredInit_NEON();
-void IntraPredSmoothInit_NEON();
} // namespace dsp
} // namespace libgav1
#if LIBGAV1_ENABLE_NEON
-// 8 bit
-#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
-
// 4x4
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
// 4x8
#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
// 4x16
#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
// 8x4
#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
// 8x8
#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
// 8x16
#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
// 8x32
#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x4
#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x8
#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x16
#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x32
#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x64
#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 32x8
#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
// 32x16
#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
// 32x32
#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
// 32x64
#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 64x16
#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 64x32
#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 64x64
#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 10 bit
// 4x4
#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 4x8
#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 4x16
#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 8x4
#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 8x8
#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 8x16
#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 8x32
#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x4
#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x8
#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x16
#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x32
#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x64
#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 32x8
#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 32x16
#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 32x32
#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 32x64
#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 64x16
#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 64x32
#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 64x64
#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_
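The per-size LIBGAV1_Dsp10bpp_* macros added above follow the usual libgav1 convention for advertising a SIMD implementation: a header defines the macro to LIBGAV1_CPU_NEON, and the portable init code skips registering its C fallback for that slot. The guard below is an illustrative sketch of that convention, not a verbatim quote from the C sources; the fallback function name is hypothetical.

// Illustrative guard as it typically appears in the portable init code.
#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical
  // No SIMD header claimed this slot, so register the C fallback.
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
      IntraPredictorVertical_C;  // hypothetical C fallback name
#endif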
diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc
index abc93e8..c33f333 100644
--- a/src/dsp/arm/intrapred_smooth_neon.cc
+++ b/src/dsp/arm/intrapred_smooth_neon.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_smooth.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
@@ -26,6 +26,7 @@
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
@@ -605,7 +606,7 @@ void IntraPredSmoothInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/intrapred_smooth_neon.h b/src/dsp/arm/intrapred_smooth_neon.h
new file mode 100644
index 0000000..edd01be
--- /dev/null
+++ b/src/dsp/arm/intrapred_smooth_neon.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
+void IntraPredSmoothInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
diff --git a/src/dsp/arm/inverse_transform_10bit_neon.cc b/src/dsp/arm/inverse_transform_10bit_neon.cc
new file mode 100644
index 0000000..ff184a1
--- /dev/null
+++ b/src/dsp/arm/inverse_transform_10bit_neon.cc
@@ -0,0 +1,2543 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+//------------------------------------------------------------------------------
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int32x4_t in[4],
+ int32x4_t out[4]) {
+ // in:
+ // 00 01 02 03
+ // 10 11 12 13
+ // 20 21 22 23
+ // 30 31 32 33
+
+ // 00 10 02 12 a.val[0]
+ // 01 11 03 13 a.val[1]
+ // 20 30 22 32 b.val[0]
+ // 21 31 23 33 b.val[1]
+ const int32x4x2_t a = vtrnq_s32(in[0], in[1]);
+ const int32x4x2_t b = vtrnq_s32(in[2], in[3]);
+ out[0] = vextq_s32(vextq_s32(a.val[0], a.val[0], 2), b.val[0], 2);
+ out[1] = vextq_s32(vextq_s32(a.val[1], a.val[1], 2), b.val[1], 2);
+ out[2] = vextq_s32(a.val[0], vextq_s32(b.val[0], b.val[0], 2), 2);
+ out[3] = vextq_s32(a.val[1], vextq_s32(b.val[1], b.val[1], 2), 2);
+ // out:
+ // 00 10 20 30
+ // 01 11 21 31
+ // 02 12 22 32
+ // 03 13 23 33
+}
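The lane diagrams in the comments above describe the vtrn/vext shuffle; as a cross-check, here is a plain scalar statement of the same 4x4 transpose. The helper is hypothetical and exists only to document the intended result.

// Scalar reference for Transpose4x4: out[r][c] = in[c][r].
void Transpose4x4Scalar(const int32_t in[4][4], int32_t out[4][4]) {
  for (int r = 0; r < 4; ++r) {
    for (int c = 0; c < 4; ++c) out[r][c] = in[c][r];
  }
}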
+
+//------------------------------------------------------------------------------
+template <int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* dst, int32_t stride, int32_t idx,
+ const int32x4_t* const s) {
+ assert(store_count % 4 == 0);
+ for (int i = 0; i < store_count; i += 4) {
+ vst1q_s32(&dst[i * stride + idx], s[i]);
+ vst1q_s32(&dst[(i + 1) * stride + idx], s[i + 1]);
+ vst1q_s32(&dst[(i + 2) * stride + idx], s[i + 2]);
+ vst1q_s32(&dst[(i + 3) * stride + idx], s[i + 3]);
+ }
+}
+
+template <int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int32_t* src, int32_t stride,
+ int32_t idx, int32x4_t* x) {
+ assert(load_count % 4 == 0);
+ for (int i = 0; i < load_count; i += 4) {
+ x[i] = vld1q_s32(&src[i * stride + idx]);
+ x[i + 1] = vld1q_s32(&src[(i + 1) * stride + idx]);
+ x[i + 2] = vld1q_s32(&src[(i + 2) * stride + idx]);
+ x[i + 3] = vld1q_s32(&src[(i + 3) * stride + idx]);
+ }
+}
+
+// Butterfly rotate 4 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int32x4_t* a, int32x4_t* b,
+ const int angle,
+ const bool flip) {
+ const int32_t cos128 = Cos128(angle);
+ const int32_t sin128 = Sin128(angle);
+ const int32x4_t acc_x = vmulq_n_s32(*a, cos128);
+ const int32x4_t acc_y = vmulq_n_s32(*a, sin128);
+  // The max range for the input is 18 bits. The cos128/sin128 values are 13
+  // bits, which leaves 1 bit for the add/subtract. For 10bpp, x/y will fit in
+  // a 32-bit lane.

+ const int32x4_t x0 = vmlsq_n_s32(acc_x, *b, sin128);
+ const int32x4_t y0 = vmlaq_n_s32(acc_y, *b, cos128);
+ const int32x4_t x = vrshrq_n_s32(x0, 12);
+ const int32x4_t y = vrshrq_n_s32(y0, 12);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
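In scalar terms, the butterfly above computes x = (a*cos - b*sin + 2048) >> 12 and y = (a*sin + b*cos + 2048) >> 12, with cos128/sin128 in Q12 fixed point; the vrshrq_n_s32 calls supply the +2048 rounding. The sketch below restates that, assuming the Cos128/Sin128 helpers from inverse_transform.inc; the function name is hypothetical.

// Scalar restatement of ButterflyRotation_4 (Q12 fixed-point rotation).
void ButterflyRotationScalarSketch(int32_t* a, int32_t* b, int angle,
                                   bool flip) {
  const int64_t cos128 = Cos128(angle);  // Q12 cosine
  const int64_t sin128 = Sin128(angle);  // Q12 sine
  const int64_t x = (*a * cos128 - *b * sin128 + 2048) >> 12;
  const int64_t y = (*a * sin128 + *b * cos128 + 2048) >> 12;
  *a = static_cast<int32_t>(flip ? y : x);
  *b = static_cast<int32_t>(flip ? x : y);
}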
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int32x4_t* a,
+ int32x4_t* b,
+ const int angle,
+ const bool flip) {
+ const int32_t cos128 = Cos128(angle);
+ const int32_t sin128 = Sin128(angle);
+ assert(sin128 <= 0xfff);
+ const int32x4_t x0 = vmulq_n_s32(*b, -sin128);
+ const int32x4_t y0 = vmulq_n_s32(*b, cos128);
+ const int32x4_t x = vrshrq_n_s32(x0, 12);
+ const int32x4_t y = vrshrq_n_s32(y0, 12);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int32x4_t* a,
+ int32x4_t* b,
+ const int angle,
+ const bool flip) {
+ const int32_t cos128 = Cos128(angle);
+ const int32_t sin128 = Sin128(angle);
+ const int32x4_t x0 = vmulq_n_s32(*a, cos128);
+ const int32x4_t y0 = vmulq_n_s32(*a, sin128);
+ const int32x4_t x = vrshrq_n_s32(x0, 12);
+ const int32x4_t y = vrshrq_n_s32(y0, 12);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
+ bool flip) {
+ int32x4_t x, y;
+ if (flip) {
+ y = vqaddq_s32(*b, *a);
+ x = vqsubq_s32(*b, *a);
+ } else {
+ x = vqaddq_s32(*a, *b);
+ y = vqsubq_s32(*a, *b);
+ }
+ *a = x;
+ *b = y;
+}
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
+ bool flip, const int32x4_t* min,
+ const int32x4_t* max) {
+ int32x4_t x, y;
+ if (flip) {
+ y = vqaddq_s32(*b, *a);
+ x = vqsubq_s32(*b, *a);
+ } else {
+ x = vqaddq_s32(*a, *b);
+ y = vqsubq_s32(*a, *b);
+ }
+ *a = vmaxq_s32(vminq_s32(x, *max), *min);
+ *b = vmaxq_s32(vminq_s32(y, *max), *min);
+}
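The clamped HadamardRotation is the 10-bit range-control step: sum and difference, then clamp each result to the per-stage [min, max] window so intermediates stay representable. A hedged scalar sketch (the saturating vqaddq/vqsubq behave identically here because the clamp window is narrower than int32):

// Scalar sketch of the clamped Hadamard step.
void HadamardRotationScalarSketch(int32_t* a, int32_t* b, bool flip,
                                  int32_t min, int32_t max) {
  const int64_t sum = int64_t{*a} + *b;
  const int64_t new_a = flip ? int64_t{*b} - *a : sum;          // b - a when flipped
  const int64_t new_b = flip ? sum : int64_t{*a} - *b;          // a - b otherwise
  *a = static_cast<int32_t>(
      std::min<int64_t>(std::max<int64_t>(new_a, min), max));
  *b = static_cast<int32_t>(
      std::min<int64_t>(std::max<int64_t>(new_b, min), max));
}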
+
+using ButterflyRotationFunc = void (*)(int32x4_t* a, int32x4_t* b, int angle,
+ bool flip);
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t s0 = vbslq_s32(v_mask, v_src_round, v_src);
+ const int32_t cos128 = Cos128(32);
+ const int32x4_t xy = vqrdmulhq_n_s32(s0, cos128 << (31 - 12));
+  // vqrshlq_s32 will shift right if the shift value is negative.
+ const int32x4_t xy_shifted = vqrshlq_s32(xy, vdupq_n_s32(-row_shift));
+ // Clamp result to signed 16 bits.
+ const int32x4_t result = vmovl_s16(vqmovn_s32(xy_shifted));
+ if (width == 4) {
+ vst1q_s32(dst, result);
+ } else {
+ for (int i = 0; i < width; i += 4) {
+ vst1q_s32(dst, result);
+ dst += 4;
+ }
+ }
+ return true;
+}
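A hedged scalar restatement of the DC-only row fast path above: optionally pre-round by kTransformRowMultiplier (sqrt(2) in Q12, from inverse_transform.inc), multiply by Cos128(32) (cos(pi/4) in Q12), apply the rounding row shift, saturate to int16, and broadcast across the row. The vqrdmulhq_n_s32 calls match this rounding for in-range inputs; the helper name is hypothetical.

// Hypothetical scalar sketch of DctDcOnly (<algorithm>/<cstdint> are already
// included by this file).
bool DctDcOnlyScalarSketch(int32_t* dst, int width, int adjusted_tx_height,
                           bool should_round, int row_shift) {
  if (adjusted_tx_height > 1) return false;
  const auto round_shift = [](int64_t v, int bits) {
    return bits == 0 ? v : (v + (int64_t{1} << (bits - 1))) >> bits;
  };
  int64_t dc = dst[0];
  if (should_round) dc = round_shift(dc * kTransformRowMultiplier, 12);
  dc = round_shift(dc * Cos128(32), 12);  // cos(pi/4) in Q12.
  dc = round_shift(dc, row_shift);
  dc = std::min<int64_t>(std::max<int64_t>(dc, INT16_MIN), INT16_MAX);
  for (int i = 0; i < width; ++i) dst[i] = static_cast<int32_t>(dc);
  return true;
}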
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32_t cos128 = Cos128(32);
+
+  // Calculate the DC values for the first row.
+ if (width == 4) {
+ const int32x4_t v_src = vld1q_s32(dst);
+ const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
+ vst1q_s32(dst, xy);
+ } else {
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(&dst[i]);
+ const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
+ vst1q_s32(&dst[i], xy);
+ i += 4;
+ } while (i < width);
+ }
+
+ // Copy first row to the rest of the block.
+ for (int y = 1; y < height; ++y) {
+ memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+ }
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(int32x4_t* s, const int32x4_t* min,
+ const int32x4_t* max,
+ const bool is_last_stage) {
+ // stage 12.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+ ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+ } else {
+ butterfly_rotation(&s[0], &s[1], 32, true);
+ butterfly_rotation(&s[2], &s[3], 48, false);
+ }
+
+ // stage 17.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[3], false);
+ HadamardRotation(&s[1], &s[2], false);
+ } else {
+ HadamardRotation(&s[0], &s[3], false, min, max);
+ HadamardRotation(&s[1], &s[2], false, min, max);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+  // When |is_row| is true, set the range to the row range; otherwise, use the
+  // column range.
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[4], x[4];
+
+ LoadSrc<4>(dst, step, 0, x);
+ if (is_row) {
+ Transpose4x4(x, x);
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 2, 1, 3
+ s[0] = x[0];
+ s[1] = x[2];
+ s[2] = x[1];
+ s[3] = x[3];
+
+ Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 4; ++i) {
+ s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
+ }
+ Transpose4x4(s, s);
+ }
+ StoreDst<4>(dst, step, 0, s);
+}
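The row path above ends with an idiom that recurs in every Dct*_NEON function: vmovl_s16(vqmovn_s32(vqrshlq_s32(v, -row_shift))), i.e. a rounding right shift followed by saturation to the int16 range, kept in a 32-bit lane. A scalar restatement (hypothetical helper) for reference:

// Scalar equivalent of the per-lane row-shift-and-clamp idiom.
int32_t RowShiftAndClamp16(int32_t v, int row_shift) {
  const int64_t shifted =
      row_shift > 0
          ? (int64_t{v} + (int64_t{1} << (row_shift - 1))) >> row_shift
          : int64_t{v};
  return static_cast<int32_t>(
      std::min<int64_t>(std::max<int64_t>(shifted, INT16_MIN), INT16_MAX));
}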
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(int32x4_t* s, const int32x4_t* min,
+ const int32x4_t* max,
+ const bool is_last_stage) {
+ // stage 8.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+ ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+ } else {
+ butterfly_rotation(&s[4], &s[7], 56, false);
+ butterfly_rotation(&s[5], &s[6], 24, false);
+ }
+
+ // stage 13.
+ HadamardRotation(&s[4], &s[5], false, min, max);
+ HadamardRotation(&s[6], &s[7], true, min, max);
+
+ // stage 18.
+ butterfly_rotation(&s[6], &s[5], 32, true);
+
+ // stage 22.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[7], false);
+ HadamardRotation(&s[1], &s[6], false);
+ HadamardRotation(&s[2], &s[5], false);
+ HadamardRotation(&s[3], &s[4], false);
+ } else {
+ HadamardRotation(&s[0], &s[7], false, min, max);
+ HadamardRotation(&s[1], &s[6], false, min, max);
+ HadamardRotation(&s[2], &s[5], false, min, max);
+ HadamardRotation(&s[3], &s[4], false, min, max);
+ }
+}
+
+// Process dct8 rows or columns, depending on the |is_row| flag.
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[8], x[8];
+
+ if (is_row) {
+ LoadSrc<4>(dst, step, 0, &x[0]);
+ LoadSrc<4>(dst, step, 4, &x[4]);
+ Transpose4x4(&x[0], &x[0]);
+ Transpose4x4(&x[4], &x[4]);
+ } else {
+ LoadSrc<8>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+ s[0] = x[0];
+ s[1] = x[4];
+ s[2] = x[2];
+ s[3] = x[6];
+ s[4] = x[1];
+ s[5] = x[5];
+ s[6] = x[3];
+ s[7] = x[7];
+
+ Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
+ Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 8; ++i) {
+ s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
+ }
+ Transpose4x4(&s[0], &s[0]);
+ Transpose4x4(&s[4], &s[4]);
+ StoreDst<4>(dst, step, 0, &s[0]);
+ StoreDst<4>(dst, step, 4, &s[4]);
+ } else {
+ StoreDst<8>(dst, step, 0, &s[0]);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(int32x4_t* s, const int32x4_t* min,
+ const int32x4_t* max,
+ const bool is_last_stage) {
+ // stage 5.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+ ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+ ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+ ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+ } else {
+ butterfly_rotation(&s[8], &s[15], 60, false);
+ butterfly_rotation(&s[9], &s[14], 28, false);
+ butterfly_rotation(&s[10], &s[13], 44, false);
+ butterfly_rotation(&s[11], &s[12], 12, false);
+ }
+
+ // stage 9.
+ HadamardRotation(&s[8], &s[9], false, min, max);
+ HadamardRotation(&s[10], &s[11], true, min, max);
+ HadamardRotation(&s[12], &s[13], false, min, max);
+ HadamardRotation(&s[14], &s[15], true, min, max);
+
+ // stage 14.
+ butterfly_rotation(&s[14], &s[9], 48, true);
+ butterfly_rotation(&s[13], &s[10], 112, true);
+
+ // stage 19.
+ HadamardRotation(&s[8], &s[11], false, min, max);
+ HadamardRotation(&s[9], &s[10], false, min, max);
+ HadamardRotation(&s[12], &s[15], true, min, max);
+ HadamardRotation(&s[13], &s[14], true, min, max);
+
+ // stage 23.
+ butterfly_rotation(&s[13], &s[10], 32, true);
+ butterfly_rotation(&s[12], &s[11], 32, true);
+
+ // stage 26.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[15], false);
+ HadamardRotation(&s[1], &s[14], false);
+ HadamardRotation(&s[2], &s[13], false);
+ HadamardRotation(&s[3], &s[12], false);
+ HadamardRotation(&s[4], &s[11], false);
+ HadamardRotation(&s[5], &s[10], false);
+ HadamardRotation(&s[6], &s[9], false);
+ HadamardRotation(&s[7], &s[8], false);
+ } else {
+ HadamardRotation(&s[0], &s[15], false, min, max);
+ HadamardRotation(&s[1], &s[14], false, min, max);
+ HadamardRotation(&s[2], &s[13], false, min, max);
+ HadamardRotation(&s[3], &s[12], false, min, max);
+ HadamardRotation(&s[4], &s[11], false, min, max);
+ HadamardRotation(&s[5], &s[10], false, min, max);
+ HadamardRotation(&s[6], &s[9], false, min, max);
+ HadamardRotation(&s[7], &s[8], false, min, max);
+ }
+}
+
+// Process dct16 rows or columns, depending on the |is_row| flag.
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[16], x[16];
+
+ if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ LoadSrc<16>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1
+ // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ s[0] = x[0];
+ s[1] = x[8];
+ s[2] = x[4];
+ s[3] = x[12];
+ s[4] = x[2];
+ s[5] = x[10];
+ s[6] = x[6];
+ s[7] = x[14];
+ s[8] = x[1];
+ s[9] = x[9];
+ s[10] = x[5];
+ s[11] = x[13];
+ s[12] = x[3];
+ s[13] = x[11];
+ s[14] = x[7];
+ s[15] = x[15];
+
+ Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
+ Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
+ Dct16Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 16; ++i) {
+ s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
+ }
+ for (int idx = 0; idx < 16; idx += 8) {
+ Transpose4x4(&s[idx], &s[idx]);
+ Transpose4x4(&s[idx + 4], &s[idx + 4]);
+ StoreDst<4>(dst, step, idx, &s[idx]);
+ StoreDst<4>(dst, step, idx + 4, &s[idx + 4]);
+ }
+ } else {
+ StoreDst<16>(dst, step, 0, &s[0]);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(int32x4_t* s, const int32x4_t* min,
+ const int32x4_t* max,
+ const bool is_last_stage) {
+ // stage 3
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+ ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+ ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+ ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+ ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+ ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+ ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+ ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+ } else {
+ butterfly_rotation(&s[16], &s[31], 62, false);
+ butterfly_rotation(&s[17], &s[30], 30, false);
+ butterfly_rotation(&s[18], &s[29], 46, false);
+ butterfly_rotation(&s[19], &s[28], 14, false);
+ butterfly_rotation(&s[20], &s[27], 54, false);
+ butterfly_rotation(&s[21], &s[26], 22, false);
+ butterfly_rotation(&s[22], &s[25], 38, false);
+ butterfly_rotation(&s[23], &s[24], 6, false);
+ }
+
+ // stage 6.
+ HadamardRotation(&s[16], &s[17], false, min, max);
+ HadamardRotation(&s[18], &s[19], true, min, max);
+ HadamardRotation(&s[20], &s[21], false, min, max);
+ HadamardRotation(&s[22], &s[23], true, min, max);
+ HadamardRotation(&s[24], &s[25], false, min, max);
+ HadamardRotation(&s[26], &s[27], true, min, max);
+ HadamardRotation(&s[28], &s[29], false, min, max);
+ HadamardRotation(&s[30], &s[31], true, min, max);
+
+ // stage 10.
+ butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+ butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+ butterfly_rotation(&s[26], &s[21], 24, true);
+ butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+ // stage 15.
+ HadamardRotation(&s[16], &s[19], false, min, max);
+ HadamardRotation(&s[17], &s[18], false, min, max);
+ HadamardRotation(&s[20], &s[23], true, min, max);
+ HadamardRotation(&s[21], &s[22], true, min, max);
+ HadamardRotation(&s[24], &s[27], false, min, max);
+ HadamardRotation(&s[25], &s[26], false, min, max);
+ HadamardRotation(&s[28], &s[31], true, min, max);
+ HadamardRotation(&s[29], &s[30], true, min, max);
+
+ // stage 20.
+ butterfly_rotation(&s[29], &s[18], 48, true);
+ butterfly_rotation(&s[28], &s[19], 48, true);
+ butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+ butterfly_rotation(&s[26], &s[21], 48 + 64, true);
+
+ // stage 24.
+ HadamardRotation(&s[16], &s[23], false, min, max);
+ HadamardRotation(&s[17], &s[22], false, min, max);
+ HadamardRotation(&s[18], &s[21], false, min, max);
+ HadamardRotation(&s[19], &s[20], false, min, max);
+ HadamardRotation(&s[24], &s[31], true, min, max);
+ HadamardRotation(&s[25], &s[30], true, min, max);
+ HadamardRotation(&s[26], &s[29], true, min, max);
+ HadamardRotation(&s[27], &s[28], true, min, max);
+
+ // stage 27.
+ butterfly_rotation(&s[27], &s[20], 32, true);
+ butterfly_rotation(&s[26], &s[21], 32, true);
+ butterfly_rotation(&s[25], &s[22], 32, true);
+ butterfly_rotation(&s[24], &s[23], 32, true);
+
+ // stage 29.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[31], false);
+ HadamardRotation(&s[1], &s[30], false);
+ HadamardRotation(&s[2], &s[29], false);
+ HadamardRotation(&s[3], &s[28], false);
+ HadamardRotation(&s[4], &s[27], false);
+ HadamardRotation(&s[5], &s[26], false);
+ HadamardRotation(&s[6], &s[25], false);
+ HadamardRotation(&s[7], &s[24], false);
+ HadamardRotation(&s[8], &s[23], false);
+ HadamardRotation(&s[9], &s[22], false);
+ HadamardRotation(&s[10], &s[21], false);
+ HadamardRotation(&s[11], &s[20], false);
+ HadamardRotation(&s[12], &s[19], false);
+ HadamardRotation(&s[13], &s[18], false);
+ HadamardRotation(&s[14], &s[17], false);
+ HadamardRotation(&s[15], &s[16], false);
+ } else {
+ HadamardRotation(&s[0], &s[31], false, min, max);
+ HadamardRotation(&s[1], &s[30], false, min, max);
+ HadamardRotation(&s[2], &s[29], false, min, max);
+ HadamardRotation(&s[3], &s[28], false, min, max);
+ HadamardRotation(&s[4], &s[27], false, min, max);
+ HadamardRotation(&s[5], &s[26], false, min, max);
+ HadamardRotation(&s[6], &s[25], false, min, max);
+ HadamardRotation(&s[7], &s[24], false, min, max);
+ HadamardRotation(&s[8], &s[23], false, min, max);
+ HadamardRotation(&s[9], &s[22], false, min, max);
+ HadamardRotation(&s[10], &s[21], false, min, max);
+ HadamardRotation(&s[11], &s[20], false, min, max);
+ HadamardRotation(&s[12], &s[19], false, min, max);
+ HadamardRotation(&s[13], &s[18], false, min, max);
+ HadamardRotation(&s[14], &s[17], false, min, max);
+ HadamardRotation(&s[15], &s[16], false, min, max);
+ }
+}
+
+// Process dct32 rows or columns, depending on the |is_row| flag.
+LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
+ const bool is_row, int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[32], x[32];
+
+ if (is_row) {
+ for (int idx = 0; idx < 32; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ LoadSrc<32>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
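+ // (For reference, a sketch of how this ordering arises: each position i in
+ // 0..15 maps to its 5-bit bit-reversal, e.g. 1 = 00001b -> 10000b = 16,
+ // 2 = 00010b -> 01000b = 8, 3 = 00011b -> 11000b = 24.)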
+ s[0] = x[0];
+ s[1] = x[16];
+ s[2] = x[8];
+ s[3] = x[24];
+ s[4] = x[4];
+ s[5] = x[20];
+ s[6] = x[12];
+ s[7] = x[28];
+ s[8] = x[2];
+ s[9] = x[18];
+ s[10] = x[10];
+ s[11] = x[26];
+ s[12] = x[6];
+ s[13] = x[22];
+ s[14] = x[14];
+ s[15] = x[30];
+
+ // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+ s[16] = x[1];
+ s[17] = x[17];
+ s[18] = x[9];
+ s[19] = x[25];
+ s[20] = x[5];
+ s[21] = x[21];
+ s[22] = x[13];
+ s[23] = x[29];
+ s[24] = x[3];
+ s[25] = x[19];
+ s[26] = x[11];
+ s[27] = x[27];
+ s[28] = x[7];
+ s[29] = x[23];
+ s[30] = x[15];
+ s[31] = x[31];
+
+ Dct4Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
+ Dct8Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
+ Dct16Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
+ Dct32Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int idx = 0; idx < 32; idx += 8) {
+ int32x4_t output[8];
+ Transpose4x4(&s[idx], &output[0]);
+ Transpose4x4(&s[idx + 4], &output[4]);
+ for (int i = 0; i < 8; ++i) {
+ output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
+ }
+ StoreDst<4>(dst, step, idx, &output[0]);
+ StoreDst<4>(dst, step, idx + 4, &output[4]);
+ }
+ } else {
+ StoreDst<32>(dst, step, 0, &s[0]);
+ }
+}
+
+void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[64], x[32];
+
+ if (is_row) {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ for (int idx = 0; idx < 32; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ // The last 32 values of every column are always zero if the |tx_height| is
+ // 64.
+ LoadSrc<32>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+ s[0] = x[0];
+ s[2] = x[16];
+ s[4] = x[8];
+ s[6] = x[24];
+ s[8] = x[4];
+ s[10] = x[20];
+ s[12] = x[12];
+ s[14] = x[28];
+
+ // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+ s[16] = x[2];
+ s[18] = x[18];
+ s[20] = x[10];
+ s[22] = x[26];
+ s[24] = x[6];
+ s[26] = x[22];
+ s[28] = x[14];
+ s[30] = x[30];
+
+ // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+ s[32] = x[1];
+ s[34] = x[17];
+ s[36] = x[9];
+ s[38] = x[25];
+ s[40] = x[5];
+ s[42] = x[21];
+ s[44] = x[13];
+ s[46] = x[29];
+
+ // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
+ s[48] = x[3];
+ s[50] = x[19];
+ s[52] = x[11];
+ s[54] = x[27];
+ s[56] = x[7];
+ s[58] = x[23];
+ s[60] = x[15];
+ s[62] = x[31];
+
+ Dct4Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, &min, &max, /*is_last_stage=*/false);
+ Dct8Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, &min, &max, /*is_last_stage=*/false);
+ Dct16Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, &min, &max, /*is_last_stage=*/false);
+ Dct32Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, &min, &max, /*is_last_stage=*/false);
+
+ //-- start dct 64 stages
+ // stage 2.
+ ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
+ ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
+ ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
+ ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
+ ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
+ ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
+ ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
+ ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
+ ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
+ ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
+ ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
+ ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
+ ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
+ ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
+ ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
+ ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
+
+ // stage 4.
+ HadamardRotation(&s[32], &s[33], false, &min, &max);
+ HadamardRotation(&s[34], &s[35], true, &min, &max);
+ HadamardRotation(&s[36], &s[37], false, &min, &max);
+ HadamardRotation(&s[38], &s[39], true, &min, &max);
+ HadamardRotation(&s[40], &s[41], false, &min, &max);
+ HadamardRotation(&s[42], &s[43], true, &min, &max);
+ HadamardRotation(&s[44], &s[45], false, &min, &max);
+ HadamardRotation(&s[46], &s[47], true, &min, &max);
+ HadamardRotation(&s[48], &s[49], false, &min, &max);
+ HadamardRotation(&s[50], &s[51], true, &min, &max);
+ HadamardRotation(&s[52], &s[53], false, &min, &max);
+ HadamardRotation(&s[54], &s[55], true, &min, &max);
+ HadamardRotation(&s[56], &s[57], false, &min, &max);
+ HadamardRotation(&s[58], &s[59], true, &min, &max);
+ HadamardRotation(&s[60], &s[61], false, &min, &max);
+ HadamardRotation(&s[62], &s[63], true, &min, &max);
+
+ // stage 7.
+ ButterflyRotation_4(&s[62], &s[33], 60 - 0, true);
+ ButterflyRotation_4(&s[61], &s[34], 60 - 0 + 64, true);
+ ButterflyRotation_4(&s[58], &s[37], 60 - 32, true);
+ ButterflyRotation_4(&s[57], &s[38], 60 - 32 + 64, true);
+ ButterflyRotation_4(&s[54], &s[41], 60 - 16, true);
+ ButterflyRotation_4(&s[53], &s[42], 60 - 16 + 64, true);
+ ButterflyRotation_4(&s[50], &s[45], 60 - 48, true);
+ ButterflyRotation_4(&s[49], &s[46], 60 - 48 + 64, true);
+
+ // stage 11.
+ HadamardRotation(&s[32], &s[35], false, &min, &max);
+ HadamardRotation(&s[33], &s[34], false, &min, &max);
+ HadamardRotation(&s[36], &s[39], true, &min, &max);
+ HadamardRotation(&s[37], &s[38], true, &min, &max);
+ HadamardRotation(&s[40], &s[43], false, &min, &max);
+ HadamardRotation(&s[41], &s[42], false, &min, &max);
+ HadamardRotation(&s[44], &s[47], true, &min, &max);
+ HadamardRotation(&s[45], &s[46], true, &min, &max);
+ HadamardRotation(&s[48], &s[51], false, &min, &max);
+ HadamardRotation(&s[49], &s[50], false, &min, &max);
+ HadamardRotation(&s[52], &s[55], true, &min, &max);
+ HadamardRotation(&s[53], &s[54], true, &min, &max);
+ HadamardRotation(&s[56], &s[59], false, &min, &max);
+ HadamardRotation(&s[57], &s[58], false, &min, &max);
+ HadamardRotation(&s[60], &s[63], true, &min, &max);
+ HadamardRotation(&s[61], &s[62], true, &min, &max);
+
+ // stage 16.
+ ButterflyRotation_4(&s[61], &s[34], 56, true);
+ ButterflyRotation_4(&s[60], &s[35], 56, true);
+ ButterflyRotation_4(&s[59], &s[36], 56 + 64, true);
+ ButterflyRotation_4(&s[58], &s[37], 56 + 64, true);
+ ButterflyRotation_4(&s[53], &s[42], 56 - 32, true);
+ ButterflyRotation_4(&s[52], &s[43], 56 - 32, true);
+ ButterflyRotation_4(&s[51], &s[44], 56 - 32 + 64, true);
+ ButterflyRotation_4(&s[50], &s[45], 56 - 32 + 64, true);
+
+ // stage 21.
+ HadamardRotation(&s[32], &s[39], false, &min, &max);
+ HadamardRotation(&s[33], &s[38], false, &min, &max);
+ HadamardRotation(&s[34], &s[37], false, &min, &max);
+ HadamardRotation(&s[35], &s[36], false, &min, &max);
+ HadamardRotation(&s[40], &s[47], true, &min, &max);
+ HadamardRotation(&s[41], &s[46], true, &min, &max);
+ HadamardRotation(&s[42], &s[45], true, &min, &max);
+ HadamardRotation(&s[43], &s[44], true, &min, &max);
+ HadamardRotation(&s[48], &s[55], false, &min, &max);
+ HadamardRotation(&s[49], &s[54], false, &min, &max);
+ HadamardRotation(&s[50], &s[53], false, &min, &max);
+ HadamardRotation(&s[51], &s[52], false, &min, &max);
+ HadamardRotation(&s[56], &s[63], true, &min, &max);
+ HadamardRotation(&s[57], &s[62], true, &min, &max);
+ HadamardRotation(&s[58], &s[61], true, &min, &max);
+ HadamardRotation(&s[59], &s[60], true, &min, &max);
+
+ // stage 25.
+ ButterflyRotation_4(&s[59], &s[36], 48, true);
+ ButterflyRotation_4(&s[58], &s[37], 48, true);
+ ButterflyRotation_4(&s[57], &s[38], 48, true);
+ ButterflyRotation_4(&s[56], &s[39], 48, true);
+ ButterflyRotation_4(&s[55], &s[40], 112, true);
+ ButterflyRotation_4(&s[54], &s[41], 112, true);
+ ButterflyRotation_4(&s[53], &s[42], 112, true);
+ ButterflyRotation_4(&s[52], &s[43], 112, true);
+
+ // stage 28.
+ HadamardRotation(&s[32], &s[47], false, &min, &max);
+ HadamardRotation(&s[33], &s[46], false, &min, &max);
+ HadamardRotation(&s[34], &s[45], false, &min, &max);
+ HadamardRotation(&s[35], &s[44], false, &min, &max);
+ HadamardRotation(&s[36], &s[43], false, &min, &max);
+ HadamardRotation(&s[37], &s[42], false, &min, &max);
+ HadamardRotation(&s[38], &s[41], false, &min, &max);
+ HadamardRotation(&s[39], &s[40], false, &min, &max);
+ HadamardRotation(&s[48], &s[63], true, &min, &max);
+ HadamardRotation(&s[49], &s[62], true, &min, &max);
+ HadamardRotation(&s[50], &s[61], true, &min, &max);
+ HadamardRotation(&s[51], &s[60], true, &min, &max);
+ HadamardRotation(&s[52], &s[59], true, &min, &max);
+ HadamardRotation(&s[53], &s[58], true, &min, &max);
+ HadamardRotation(&s[54], &s[57], true, &min, &max);
+ HadamardRotation(&s[55], &s[56], true, &min, &max);
+
+ // stage 30.
+ ButterflyRotation_4(&s[55], &s[40], 32, true);
+ ButterflyRotation_4(&s[54], &s[41], 32, true);
+ ButterflyRotation_4(&s[53], &s[42], 32, true);
+ ButterflyRotation_4(&s[52], &s[43], 32, true);
+ ButterflyRotation_4(&s[51], &s[44], 32, true);
+ ButterflyRotation_4(&s[50], &s[45], 32, true);
+ ButterflyRotation_4(&s[49], &s[46], 32, true);
+ ButterflyRotation_4(&s[48], &s[47], 32, true);
+
+ // stage 31.
+ for (int i = 0; i < 32; i += 4) {
+ HadamardRotation(&s[i], &s[63 - i], false, &min, &max);
+ HadamardRotation(&s[i + 1], &s[63 - i - 1], false, &min, &max);
+ HadamardRotation(&s[i + 2], &s[63 - i - 2], false, &min, &max);
+ HadamardRotation(&s[i + 3], &s[63 - i - 3], false, &min, &max);
+ }
+ //-- end dct 64 stages
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int idx = 0; idx < 64; idx += 8) {
+ int32x4_t output[8];
+ Transpose4x4(&s[idx], &output[0]);
+ Transpose4x4(&s[idx + 4], &output[4]);
+ for (int i = 0; i < 8; ++i) {
+ output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
+ }
+ StoreDst<4>(dst, step, idx, &output[0]);
+ StoreDst<4>(dst, step, idx + 4, &output[4]);
+ }
+ } else {
+ StoreDst<64>(dst, step, 0, &s[0]);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ int32x4_t s[8];
+ int32x4_t x[4];
+
+ LoadSrc<4>(dst, step, 0, x);
+ if (is_row) {
+ Transpose4x4(x, x);
+ }
+
+ // stage 1.
+ s[5] = vmulq_n_s32(x[3], kAdst4Multiplier[1]);
+ s[6] = vmulq_n_s32(x[3], kAdst4Multiplier[3]);
+
+ // stage 2.
+ const int32x4_t a7 = vsubq_s32(x[0], x[2]);
+ const int32x4_t b7 = vaddq_s32(a7, x[3]);
+
+ // stage 3.
+ s[0] = vmulq_n_s32(x[0], kAdst4Multiplier[0]);
+ s[1] = vmulq_n_s32(x[0], kAdst4Multiplier[1]);
+ // s[0] = s[0] + s[3]
+ s[0] = vmlaq_n_s32(s[0], x[2], kAdst4Multiplier[3]);
+ // s[1] = s[1] - s[4]
+ s[1] = vmlsq_n_s32(s[1], x[2], kAdst4Multiplier[0]);
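+ // (Here s[3] and s[4] stand for the reference implementation's
+ // intermediates x[2] * kAdst4Multiplier[3] and x[2] * kAdst4Multiplier[0];
+ // both products are folded into the vmlaq/vmlsq accumulations above.)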
+
+ s[3] = vmulq_n_s32(x[1], kAdst4Multiplier[2]);
+ s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);
+
+ // stage 4.
+ s[0] = vaddq_s32(s[0], s[5]);
+ s[1] = vsubq_s32(s[1], s[6]);
+
+ // stages 5 and 6.
+ const int32x4_t x0 = vaddq_s32(s[0], s[3]);
+ const int32x4_t x1 = vaddq_s32(s[1], s[3]);
+ const int32x4_t x3_a = vaddq_s32(s[0], s[1]);
+ const int32x4_t x3 = vsubq_s32(x3_a, s[3]);
+ x[0] = vrshrq_n_s32(x0, 12);
+ x[1] = vrshrq_n_s32(x1, 12);
+ x[2] = vrshrq_n_s32(s[2], 12);
+ x[3] = vrshrq_n_s32(x3, 12);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ x[0] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[0], v_row_shift)));
+ x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift)));
+ x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift)));
+ x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift)));
+ Transpose4x4(x, x);
+ }
+ StoreDst<4>(dst, step, 0, x);
+}
+
+alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
+ 2482};
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[2];
+
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src0_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src0_round, v_src0);
+ const int32x4_t kAdst4DcOnlyMultipliers = vld1q_s32(kAdst4DcOnlyMultiplier);
+ s[1] = vdupq_n_s32(0);
+
+ // s0*k0 s0*k1 s0*k2 s0*k1
+ s[0] = vmulq_s32(kAdst4DcOnlyMultipliers, v_src);
+ // 0 0 0 s0*k0
+ s[1] = vextq_s32(s[1], s[0], 1);
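+ // (Worked through lane by lane: the sum below is
+ // { s0*k0, s0*k1, s0*k2, s0*(k0 + k1) }, i.e. the four ADST4 outputs for a
+ // DC-only input, prior to the >> 12 rounding.)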
+
+ const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+ const int32x4_t dst_0 = vrshrq_n_s32(x3, 12);
+
+ // vqrshlq_s32 will shift right if shift value is negative.
+ vst1q_s32(dst,
+ vmovl_s16(vqmovn_s32(vqrshlq_s32(dst_0, vdupq_n_s32(-row_shift)))));
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[4];
+
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(&dst[i]);
+
+ s[0] = vmulq_n_s32(v_src, kAdst4Multiplier[0]);
+ s[1] = vmulq_n_s32(v_src, kAdst4Multiplier[1]);
+ s[2] = vmulq_n_s32(v_src, kAdst4Multiplier[2]);
+
+ const int32x4_t x0 = s[0];
+ const int32x4_t x1 = s[1];
+ const int32x4_t x2 = s[2];
+ const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+ const int32x4_t dst_0 = vrshrq_n_s32(x0, 12);
+ const int32x4_t dst_1 = vrshrq_n_s32(x1, 12);
+ const int32x4_t dst_2 = vrshrq_n_s32(x2, 12);
+ const int32x4_t dst_3 = vrshrq_n_s32(x3, 12);
+
+ vst1q_s32(&dst[i], dst_0);
+ vst1q_s32(&dst[i + width * 1], dst_1);
+ vst1q_s32(&dst[i + width * 2], dst_2);
+ vst1q_s32(&dst[i + width * 3], dst_3);
+
+ i += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[8], x[8];
+
+ if (is_row) {
+ LoadSrc<4>(dst, step, 0, &x[0]);
+ LoadSrc<4>(dst, step, 4, &x[4]);
+ Transpose4x4(&x[0], &x[0]);
+ Transpose4x4(&x[4], &x[4]);
+ } else {
+ LoadSrc<8>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1.
+ s[0] = x[7];
+ s[1] = x[0];
+ s[2] = x[5];
+ s[3] = x[2];
+ s[4] = x[3];
+ s[5] = x[4];
+ s[6] = x[1];
+ s[7] = x[6];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+ butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+ butterfly_rotation(&s[6], &s[7], 60 - 48, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[4], false, &min, &max);
+ HadamardRotation(&s[1], &s[5], false, &min, &max);
+ HadamardRotation(&s[2], &s[6], false, &min, &max);
+ HadamardRotation(&s[3], &s[7], false, &min, &max);
+
+ // stage 4.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[2], false, &min, &max);
+ HadamardRotation(&s[4], &s[6], false, &min, &max);
+ HadamardRotation(&s[1], &s[3], false, &min, &max);
+ HadamardRotation(&s[5], &s[7], false, &min, &max);
+
+ // stage 6.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s32(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s32(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s32(s[1]);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 8; ++i) {
+ x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift)));
+ }
+ Transpose4x4(&x[0], &x[0]);
+ Transpose4x4(&x[4], &x[4]);
+ StoreDst<4>(dst, step, 0, &x[0]);
+ StoreDst<4>(dst, step, 4, &x[4]);
+ } else {
+ StoreDst<8>(dst, step, 0, &x[0]);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[8];
+
+ const int32x4_t v_src = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+ // stage 1.
+ s[1] = vbslq_s32(v_mask, v_src_round, v_src);
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ int32x4_t x[8];
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s32(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s32(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s32(s[1]);
+
+ for (int i = 0; i < 8; ++i) {
+ // vqrshlq_s32 will shift right if shift value is negative.
+ x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift))));
+ vst1q_lane_s32(&dst[i], x[i], 0);
+ }
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[8];
+
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ int32x4_t x[8];
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s32(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s32(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s32(s[1]);
+
+ for (int j = 0; j < 8; ++j) {
+ vst1q_s32(&dst[j * width], x[j]);
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[16], x[16];
+
+ if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ LoadSrc<16>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1.
+ s[0] = x[15];
+ s[1] = x[0];
+ s[2] = x[13];
+ s[3] = x[2];
+ s[4] = x[11];
+ s[5] = x[4];
+ s[6] = x[9];
+ s[7] = x[6];
+ s[8] = x[7];
+ s[9] = x[8];
+ s[10] = x[5];
+ s[11] = x[10];
+ s[12] = x[3];
+ s[13] = x[12];
+ s[14] = x[1];
+ s[15] = x[14];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+ butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+ butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+ butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+ butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+ butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+ butterfly_rotation(&s[14], &s[15], 62 - 56, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[8], false, &min, &max);
+ HadamardRotation(&s[1], &s[9], false, &min, &max);
+ HadamardRotation(&s[2], &s[10], false, &min, &max);
+ HadamardRotation(&s[3], &s[11], false, &min, &max);
+ HadamardRotation(&s[4], &s[12], false, &min, &max);
+ HadamardRotation(&s[5], &s[13], false, &min, &max);
+ HadamardRotation(&s[6], &s[14], false, &min, &max);
+ HadamardRotation(&s[7], &s[15], false, &min, &max);
+
+ // stage 4.
+ butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+ butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+ butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 8 + 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[4], false, &min, &max);
+ HadamardRotation(&s[8], &s[12], false, &min, &max);
+ HadamardRotation(&s[1], &s[5], false, &min, &max);
+ HadamardRotation(&s[9], &s[13], false, &min, &max);
+ HadamardRotation(&s[2], &s[6], false, &min, &max);
+ HadamardRotation(&s[10], &s[14], false, &min, &max);
+ HadamardRotation(&s[3], &s[7], false, &min, &max);
+ HadamardRotation(&s[11], &s[15], false, &min, &max);
+
+ // stage 6.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 48 - 32, true);
+
+ // stage 7.
+ HadamardRotation(&s[0], &s[2], false, &min, &max);
+ HadamardRotation(&s[4], &s[6], false, &min, &max);
+ HadamardRotation(&s[8], &s[10], false, &min, &max);
+ HadamardRotation(&s[12], &s[14], false, &min, &max);
+ HadamardRotation(&s[1], &s[3], false, &min, &max);
+ HadamardRotation(&s[5], &s[7], false, &min, &max);
+ HadamardRotation(&s[9], &s[11], false, &min, &max);
+ HadamardRotation(&s[13], &s[15], false, &min, &max);
+
+ // stage 8.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+ butterfly_rotation(&s[10], &s[11], 32, true);
+ butterfly_rotation(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[8]);
+ x[2] = s[12];
+ x[3] = vqnegq_s32(s[4]);
+ x[4] = s[6];
+ x[5] = vqnegq_s32(s[14]);
+ x[6] = s[10];
+ x[7] = vqnegq_s32(s[2]);
+ x[8] = s[3];
+ x[9] = vqnegq_s32(s[11]);
+ x[10] = s[15];
+ x[11] = vqnegq_s32(s[7]);
+ x[12] = s[5];
+ x[13] = vqnegq_s32(s[13]);
+ x[14] = s[9];
+ x[15] = vqnegq_s32(s[1]);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 16; ++i) {
+ x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift)));
+ }
+ for (int idx = 0; idx < 16; idx += 8) {
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ StoreDst<4>(dst, step, idx, &x[idx]);
+ StoreDst<4>(dst, step, idx + 4, &x[idx + 4]);
+ }
+ } else {
+ StoreDst<16>(dst, step, 0, &x[0]);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int32x4_t* s, int32x4_t* x) {
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);
+
+ // stage 3.
+ s[8] = s[0];
+ s[9] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[8], &s[9], 56, true);
+
+ // stage 5.
+ s[4] = s[0];
+ s[12] = s[8];
+ s[5] = s[1];
+ s[13] = s[9];
+
+ // stage 6.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+ ButterflyRotation_4(&s[12], &s[13], 48, true);
+
+ // stage 7.
+ s[2] = s[0];
+ s[6] = s[4];
+ s[10] = s[8];
+ s[14] = s[12];
+ s[3] = s[1];
+ s[7] = s[5];
+ s[11] = s[9];
+ s[15] = s[13];
+
+ // stage 8.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+ ButterflyRotation_4(&s[10], &s[11], 32, true);
+ ButterflyRotation_4(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[8]);
+ x[2] = s[12];
+ x[3] = vqnegq_s32(s[4]);
+ x[4] = s[6];
+ x[5] = vqnegq_s32(s[14]);
+ x[6] = s[10];
+ x[7] = vqnegq_s32(s[2]);
+ x[8] = s[3];
+ x[9] = vqnegq_s32(s[11]);
+ x[10] = s[15];
+ x[11] = vqnegq_s32(s[7]);
+ x[12] = s[5];
+ x[13] = vqnegq_s32(s[13]);
+ x[14] = s[9];
+ x[15] = vqnegq_s32(s[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[16];
+ int32x4_t x[16];
+ const int32x4_t v_src = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+ // stage 1.
+ s[1] = vbslq_s32(v_mask, v_src_round, v_src);
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int i = 0; i < 16; ++i) {
+ // vqrshlq_s32 will shift right if shift value is negative.
+ x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift))));
+ vst1q_lane_s32(&dst[i], x[i], 0);
+ }
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+ int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int i = 0;
+ do {
+ int32x4_t s[16];
+ int32x4_t x[16];
+ const int32x4_t v_src = vld1q_s32(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int j = 0; j < 16; ++j) {
+ vst1q_s32(&dst[j * width], x[j]);
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+
+LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step, int shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
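+ // (Reading of the constant, as a sketch: v_dual_round = (1 << 11) +
+ // (shift << 12), i.e. the +2048 rounding term for the >> 12 of the identity
+ // multiply plus the rounding term for the extra row shift, so the single
+ // vqshlq by -(12 + shift) below applies both rounded shifts at once.)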
+ for (int i = 0; i < 4; ++i) {
+ const int32x4_t v_src = vld1q_s32(&dst[i * step]);
+ const int32x4_t v_src_mult_lo =
+ vmlaq_s32(v_dual_round, v_src, v_multiplier);
+ const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+ vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(shift_lo)));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+ const int shift = tx_height < 16 ? 0 : 1;
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+ const int32x4_t v_src_mult_lo = vmlaq_s32(v_dual_round, v_src, v_multiplier);
+ const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
+ vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+ return true;
+}
+
+template <int identity_size>
+LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
+ Array2DView<uint16_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int32_t* source) {
+ static_assert(identity_size == 4 || identity_size == 8 || identity_size == 16,
+ "Invalid identity_size.");
+ const int stride = frame.columns();
+ uint16_t* dst = frame[start_y] + start_x;
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (1 << 4)) << 11);
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ int32x4x2_t v_src, v_dst_i, a, b;
+ v_src.val[0] = vld1q_s32(&source[i * 4]);
+ v_src.val[1] = vld1q_s32(&source[(i * 4) + 4]);
+ if (identity_size == 4) {
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ } else if (identity_size == 8) {
+ v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
+ v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
+ a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
+ a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
+ } else { // identity_size == 16
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ }
+ uint16x4x2_t frame_data;
+ frame_data.val[0] = vld1_u16(dst);
+ frame_data.val[1] = vld1_u16(dst + stride);
+ b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+ b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+ vst1_u16(dst, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+ vst1_u16(dst + stride, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+ dst += stride << 1;
+ i += 2;
+ } while (i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ int32x4x2_t v_src, v_dst_i, a, b;
+ v_src.val[0] = vld1q_s32(&source[row + j]);
+ v_src.val[1] = vld1q_s32(&source[row + j + 4]);
+ if (identity_size == 4) {
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ } else if (identity_size == 8) {
+ v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
+ v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
+ a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
+ a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
+ } else { // identity_size == 16
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ }
+ uint16x4x2_t frame_data;
+ frame_data.val[0] = vld1_u16(dst + j);
+ frame_data.val[1] = vld1_u16(dst + j + 4);
+ b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+ b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+ vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+ vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
+ Array2DView<uint16_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int32_t* source) {
+ const int stride = frame.columns();
+ uint16_t* dst = frame[start_y] + start_x;
+ const int32x4_t v_round = vdupq_n_s32((1 + (0)) << 11);
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(&source[i * 4]);
+ const int32x4_t v_dst_row =
+ vshrq_n_s32(vmlaq_n_s32(v_round, v_src, kIdentity4Multiplier), 12);
+ const int32x4_t v_dst_col =
+ vmlaq_n_s32(v_round, v_dst_row, kIdentity4Multiplier);
+ const uint16x4_t frame_data = vld1_u16(dst);
+ const int32x4_t a = vrshrq_n_s32(v_dst_col, 4 + 12);
+ const int32x4_t b = vaddw_s16(a, vreinterpret_s16_u16(frame_data));
+ vst1_u16(dst, vmin_u16(vqmovun_s32(b), v_max_bitdepth));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ int32x4x2_t v_src, v_src_round, v_dst_row, v_dst_col, a, b;
+ v_src.val[0] = vld1q_s32(&source[row + j]);
+ v_src.val[1] = vld1q_s32(&source[row + j + 4]);
+ v_src_round.val[0] = vshrq_n_s32(
+ vmlaq_n_s32(v_round, v_src.val[0], kTransformRowMultiplier), 12);
+ v_src_round.val[1] = vshrq_n_s32(
+ vmlaq_n_s32(v_round, v_src.val[1], kTransformRowMultiplier), 12);
+ v_dst_row.val[0] = vqaddq_s32(v_src_round.val[0], v_src_round.val[0]);
+ v_dst_row.val[1] = vqaddq_s32(v_src_round.val[1], v_src_round.val[1]);
+ v_dst_col.val[0] =
+ vmlaq_n_s32(v_round, v_dst_row.val[0], kIdentity4Multiplier);
+ v_dst_col.val[1] =
+ vmlaq_n_s32(v_round, v_dst_row.val[1], kIdentity4Multiplier);
+ uint16x4x2_t frame_data;
+ frame_data.val[0] = vld1_u16(dst + j);
+ frame_data.val[1] = vld1_u16(dst + j + 4);
+ a.val[0] = vrshrq_n_s32(v_dst_col.val[0], 4 + 12);
+ a.val[1] = vrshrq_n_s32(v_dst_col.val[1], 4 + 12);
+ b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+ b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+ vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+ vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int32_t*>(dest);
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height equal to 32 can be simplified from
+ // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
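+ // (Sketch: e.g. A = 100 gives ((100 * 2) + 2) >> 2 = 50 and
+ // (100 + 1) >> 1 = 50, which is the vrshrq_n_s32(x, 1) below.)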
+ for (int i = 0; i < 4; ++i) {
+ const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]);
+ const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]);
+ const int32x4_t a_lo = vrshrq_n_s32(v_src_lo, 1);
+ const int32x4_t a_hi = vrshrq_n_s32(v_src_hi, 1);
+ vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(a_lo)));
+ vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(a_hi)));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int32_t*>(dest);
+
+ for (int i = 0; i < 4; ++i) {
+ const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]);
+ const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]);
+ const int32x4_t v_srcx2_lo = vqaddq_s32(v_src_lo, v_src_lo);
+ const int32x4_t v_srcx2_hi = vqaddq_s32(v_src_hi, v_src_hi);
+ vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(v_srcx2_lo)));
+ vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(v_srcx2_hi)));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+ const int32x4_t v_srcx2 = vaddq_s32(v_src, v_src);
+ const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift));
+ vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step,
+ int shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ int32x4x2_t v_src;
+ v_src.val[0] = vld1q_s32(&dst[i * step + j * 8]);
+ v_src.val[1] = vld1q_s32(&dst[i * step + j * 8 + 4]);
+ const int32x4_t v_src_mult_lo =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+ const int32x4_t v_src_mult_hi =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+ const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+ const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift);
+ vst1q_s32(&dst[i * step + j * 8], vmovl_s16(vqmovn_s32(shift_lo)));
+ vst1q_s32(&dst[i * step + j * 8 + 4], vmovl_s16(vqmovn_s32(shift_hi)));
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_src_mult_lo =
+ vmlaq_n_s32(v_dual_round, v_src, kIdentity16Multiplier);
+ const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, vdupq_n_s32(-(12 + shift)));
+ vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int32_t* source, int tx_width) {
+ if (tx_width >= 16) {
+ int i = 0;
+ do {
+ // 00 01 02 03
+ const int32x4_t a = vld1q_s32(&source[i]);
+ const int32x4_t b = vld1q_s32(&source[i + 4]);
+ const int32x4_t c = vld1q_s32(&source[i + 8]);
+ const int32x4_t d = vld1q_s32(&source[i + 12]);
+ // 01 00 03 02
+ const int32x4_t a_rev = vrev64q_s32(a);
+ const int32x4_t b_rev = vrev64q_s32(b);
+ const int32x4_t c_rev = vrev64q_s32(c);
+ const int32x4_t d_rev = vrev64q_s32(d);
+ // 03 02 01 00
+ vst1q_s32(&source[i], vextq_s32(d_rev, d_rev, 2));
+ vst1q_s32(&source[i + 4], vextq_s32(c_rev, c_rev, 2));
+ vst1q_s32(&source[i + 8], vextq_s32(b_rev, b_rev, 2));
+ vst1q_s32(&source[i + 12], vextq_s32(a_rev, a_rev, 2));
+ i += 16;
+ } while (i < tx_width * tx_height);
+ } else if (tx_width == 8) {
+ for (int i = 0; i < 8 * tx_height; i += 8) {
+ // 00 01 02 03
+ const int32x4_t a = vld1q_s32(&source[i]);
+ const int32x4_t b = vld1q_s32(&source[i + 4]);
+ // 01 00 03 02
+ const int32x4_t a_rev = vrev64q_s32(a);
+ const int32x4_t b_rev = vrev64q_s32(b);
+ // 03 02 01 00
+ vst1q_s32(&source[i], vextq_s32(b_rev, b_rev, 2));
+ vst1q_s32(&source[i + 4], vextq_s32(a_rev, a_rev, 2));
+ }
+ } else {
+ // Process two rows per iteration.
+ for (int i = 0; i < 4 * tx_height; i += 8) {
+ // 00 01 02 03
+ const int32x4_t a = vld1q_s32(&source[i]);
+ const int32x4_t b = vld1q_s32(&source[i + 4]);
+ // 01 00 03 02
+ const int32x4_t a_rev = vrev64q_s32(a);
+ const int32x4_t b_rev = vrev64q_s32(b);
+ // 03 02 01 00
+ vst1q_s32(&source[i], vextq_s32(a_rev, a_rev, 2));
+ vst1q_s32(&source[i + 4], vextq_s32(b_rev, b_rev, 2));
+ }
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int32_t* source, int num_rows) {
+ // Process eight values per iteration.
+ int i = 0;
+ do {
+ const int32x4_t a_lo = vld1q_s32(&source[i]);
+ const int32x4_t a_hi = vld1q_s32(&source[i + 4]);
+ const int32x4_t b_lo =
+ vqrdmulhq_n_s32(a_lo, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t b_hi =
+ vqrdmulhq_n_s32(a_hi, kTransformRowMultiplier << (31 - 12));
+ vst1q_s32(&source[i], b_lo);
+ vst1q_s32(&source[i + 4], b_hi);
+ i += 8;
+ } while (i < tx_width * num_rows);
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int32_t* source, int num_rows,
+ int row_shift) {
+ // vqrshlq_s32 will shift right if shift value is negative.
+ row_shift = -row_shift;
+
+ // Process eight values per iteration.
+ int i = 0;
+ do {
+ const int32x4_t residual0 = vld1q_s32(&source[i]);
+ const int32x4_t residual1 = vld1q_s32(&source[i + 4]);
+ vst1q_s32(&source[i], vqrshlq_s32(residual0, vdupq_n_s32(row_shift)));
+ vst1q_s32(&source[i + 4], vqrshlq_s32(residual1, vdupq_n_s32(row_shift)));
+ i += 8;
+ } while (i < tx_width * num_rows);
+}
+
+template <int tx_height, bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+ Array2DView<uint16_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int32_t* source, TransformType tx_type) {
+ const bool flip_rows =
+ enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+ const int stride = frame.columns();
+ uint16_t* dst = frame[start_y] + start_x;
+
+ if (tx_width == 4) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+ const int32x4_t residual = vld1q_s32(&source[row]);
+ const uint16x4_t frame_data = vld1_u16(dst);
+ const int32x4_t a = vrshrq_n_s32(residual, 4);
+ const uint32x4_t b = vaddw_u16(vreinterpretq_u32_s32(a), frame_data);
+ const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
+ vst1_u16(dst, vmin_u16(d, vdup_n_u16((1 << kBitdepth10) - 1)));
+ dst += stride;
+ }
+ } else {
+ for (int i = 0; i < tx_height; ++i) {
+ const int y = start_y + i;
+ const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+ int j = 0;
+ do {
+ const int x = start_x + j;
+ const int32x4_t residual = vld1q_s32(&source[row + j]);
+ const int32x4_t residual_hi = vld1q_s32(&source[row + j + 4]);
+ const uint16x8_t frame_data = vld1q_u16(frame[y] + x);
+ const int32x4_t a = vrshrq_n_s32(residual, 4);
+ const int32x4_t a_hi = vrshrq_n_s32(residual_hi, 4);
+ const uint32x4_t b =
+ vaddw_u16(vreinterpretq_u32_s32(a), vget_low_u16(frame_data));
+ const uint32x4_t b_hi =
+ vaddw_u16(vreinterpretq_u32_s32(a_hi), vget_high_u16(frame_data));
+ const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
+ const uint16x4_t d_hi = vqmovun_s32(vreinterpretq_s32_u32(b_hi));
+ vst1q_u16(frame[y] + x, vminq_u16(vcombine_u16(d, d_hi),
+ vdupq_n_u16((1 << kBitdepth10) - 1)));
+ j += 8;
+ } while (j < tx_width);
+ }
+ }
+}
+
+void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ const int row_shift = (tx_height == 16);
+
+ if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d dct4 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Dct4_NEON<ButterflyRotation_4>(data, /*step=*/4, /*is_row=*/true,
+ row_shift);
+ data += 16;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct4 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct4_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d dct8 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Dct8_NEON<ButterflyRotation_4>(data, /*step=*/8, /*is_row=*/true,
+ row_shift);
+ data += 32;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct8 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct8_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ // Process 4 1d dct16 rows in parallel per iteration.
+ Dct16_NEON<ButterflyRotation_4>(data, 16, /*is_row=*/true, row_shift);
+ data += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct16 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<32>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ // Process 4 1d dct32 rows in parallel per iteration.
+ Dct32_NEON(data, 32, /*is_row=*/true, row_shift);
+ data += 128;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<32>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct32 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<64>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ // Process 4 1d dct64 rows in parallel per iteration.
+ Dct64_NEON(data, 64, /*is_row=*/true, row_shift);
+ data += 128 * 2;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<64>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct64 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const int row_shift = static_cast<int>(tx_height == 16);
+ const bool should_round = (tx_height == 8);
+
+ if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst4 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Adst4_NEON(data, /*step=*/4, /*is_row=*/true, row_shift);
+ data += 16;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d adst4 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Adst4_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst8 rows in parallel per iteration.
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Adst8_NEON<ButterflyRotation_4>(data, /*step=*/8,
+ /*is_row=*/true, row_shift);
+ data += 32;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d adst8 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Adst8_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ do {
+ // Process 4 1d adst16 rows in parallel per iteration.
+ Adst16_NEON<ButterflyRotation_4>(src, 16, /*is_row=*/true, row_shift);
+ src += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Adst16TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ int i = tx_width;
+ auto* data = src;
+ do {
+ // Process 4 1d adst16 columns in parallel per iteration.
+ Adst16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Identity4TransformLoopRow_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize4x4) {
+ return;
+ }
+
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+
+ if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ const int shift = tx_height > 8 ? 1 : 0;
+ int i = adjusted_tx_height;
+ do {
+ Identity4_NEON(src, /*step=*/4, shift);
+ src += 16;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity4TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ // Special case: Process row calculations during column transform call.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+ Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+ return;
+ }
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity8TransformLoopRow_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize8x4) {
+ return;
+ }
+
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 16 can be simplified
+ // from ((A * 2) + 1) >> 1 to A. For 10bpp, A must be clamped to a signed
+ // 16-bit value.
+ if ((tx_height & 0x18) != 0) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int32x4_t v_src_lo = vld1q_s32(&src[i * 8]);
+ const int32x4_t v_src_hi = vld1q_s32(&src[(i * 8) + 4]);
+ vst1q_s32(&src[i * 8], vmovl_s16(vqmovn_s32(v_src_lo)));
+ vst1q_s32(&src[(i * 8) + 4], vmovl_s16(vqmovn_s32(v_src_hi)));
+ }
+ return;
+ }
+ if (tx_height == 32) {
+ int i = adjusted_tx_height;
+ do {
+ Identity8Row32_NEON(src, /*step=*/8);
+ src += 32;
+ i -= 4;
+ } while (i != 0);
+ return;
+ }
+
+ assert(tx_size == kTransformSize8x4);
+ int i = adjusted_tx_height;
+ do {
+ Identity8Row4_NEON(src, /*step=*/8);
+ src += 32;
+ i -= 4;
+ } while (i != 0);
+}
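A minimal scalar sketch (not part of the patch) of the tx_height 8/16 branch above: per the comment, folding the identity8 multiplier into the 1-bit row shift turns ((A * 2) + 1) >> 1 into A, so only the 10bpp clamp to the int16_t range remains, which is what the vqmovn_s32/vmovl_s16 pair does per lane.

    #include <algorithm>
    #include <cstdint>

    // Per-lane equivalent of vmovl_s16(vqmovn_s32(v)) in the loop above.
    int32_t Identity8RowClampSketch(int32_t a) {
      return std::min<int32_t>(std::max<int32_t>(a, INT16_MIN), INT16_MAX);
    }
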
+
+void Identity8TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+ int i = adjusted_tx_height;
+ do {
+ Identity16Row_NEON(src, /*step=*/16, row_shift);
+ src += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity16TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+//------------------------------------------------------------------------------
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ // Maximum transform size for Dct is 64.
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ Dct4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ Dct4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ Dct8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ Dct8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ Dct16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ Dct16TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ Dct32TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ Dct32TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ Dct64TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ Dct64TransformLoopColumn_NEON;
+
+ // Maximum transform size for Adst is 16.
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ Adst4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ Adst4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ Adst8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ Adst8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ Adst16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ Adst16TransformLoopColumn_NEON;
+
+ // Maximum transform size for Identity transform is 32.
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ Identity4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ Identity4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ Identity8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ Identity8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ Identity16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ Identity16TransformLoopColumn_NEON;
+}
+
+} // namespace
+
+void InverseTransformInit10bpp_NEON() { Init10bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_ENABLE_NEON || LIBGAV1_MAX_BITDEPTH < 10
+namespace libgav1 {
+namespace dsp {
+
+void InverseTransformInit10bpp_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/arm/inverse_transform_neon.cc b/src/dsp/arm/inverse_transform_neon.cc
index 072991a..315d5e9 100644
--- a/src/dsp/arm/inverse_transform_neon.cc
+++ b/src/dsp/arm/inverse_transform_neon.cc
@@ -3117,7 +3117,7 @@ void InverseTransformInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/inverse_transform_neon.h b/src/dsp/arm/inverse_transform_neon.h
index af647e8..91e0e83 100644
--- a/src/dsp/arm/inverse_transform_neon.h
+++ b/src/dsp/arm/inverse_transform_neon.h
@@ -26,6 +26,7 @@ namespace dsp {
// Initializes Dsp::inverse_transforms, see the defines below for specifics.
// This function is not thread-safe.
void InverseTransformInit_NEON();
+void InverseTransformInit10bpp_NEON();
} // namespace dsp
} // namespace libgav1
@@ -47,6 +48,21 @@ void InverseTransformInit_NEON();
#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize32_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize64_1DTransformDct LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformAdst LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformIdentity LIBGAV1_CPU_NEON
+
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_
diff --git a/src/dsp/arm/loop_filter_neon.cc b/src/dsp/arm/loop_filter_neon.cc
index 146c983..8d72892 100644
--- a/src/dsp/arm/loop_filter_neon.cc
+++ b/src/dsp/arm/loop_filter_neon.cc
@@ -35,7 +35,7 @@ namespace {
// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
inline uint8x8_t Hev(const uint8x8_t abd_p0p1_q0q1, const uint8_t thresh) {
const uint8x8_t a = vcgt_u8(abd_p0p1_q0q1, vdup_n_u8(thresh));
- return vorr_u8(a, RightShift<32>(a));
+ return vorr_u8(a, RightShiftVector<32>(a));
}
// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
@@ -44,7 +44,7 @@ inline uint8x8_t OuterThreshold(const uint8x8_t p0q0, const uint8x8_t p1q1,
const uint8x8x2_t a = Interleave32(p0q0, p1q1);
const uint8x8_t b = vabd_u8(a.val[0], a.val[1]);
const uint8x8_t p0q0_double = vqadd_u8(b, b);
- const uint8x8_t p1q1_half = RightShift<32>(vshr_n_u8(b, 1));
+ const uint8x8_t p1q1_half = RightShiftVector<32>(vshr_n_u8(b, 1));
const uint8x8_t c = vqadd_u8(p0q0_double, p1q1_half);
return vcle_u8(c, vdup_n_u8(outer_thresh));
}
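A minimal scalar model (illustration only) of the packed p/q layout that Hev() and OuterThreshold() assume: lane i of the 8-byte vector holds a p-side value and lane i + 4 the matching q-side value, so RightShiftVector<32> lines the q lanes up with the p lanes before they are combined.

    #include <cstdint>

    void HevModel(const uint8_t abd_p0p1_q0q1[8], uint8_t thresh, uint8_t out[8]) {
      uint8_t a[8];
      for (int i = 0; i < 8; ++i) a[i] = (abd_p0p1_q0q1[i] > thresh) ? 0xFF : 0;
      // RightShiftVector<32>: move lanes 4..7 down to lanes 0..3.
      const uint8_t shifted[8] = {a[4], a[5], a[6], a[7], 0, 0, 0, 0};
      // vorr_u8: the low lanes now hold (|p1-p0| > thresh) || (|q1-q0| > thresh).
      for (int i = 0; i < 8; ++i) out[i] = a[i] | shifted[i];
    }
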
@@ -56,7 +56,7 @@ inline uint8x8_t NeedsFilter4(const uint8x8_t abd_p0p1_q0q1,
const uint8_t inner_thresh,
const uint8_t outer_thresh) {
const uint8x8_t a = vcle_u8(abd_p0p1_q0q1, vdup_n_u8(inner_thresh));
- const uint8x8_t inner_mask = vand_u8(a, RightShift<32>(a));
+ const uint8x8_t inner_mask = vand_u8(a, RightShiftVector<32>(a));
const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
return vand_u8(inner_mask, outer_mask);
}
@@ -121,7 +121,7 @@ inline void Filter4(const uint8x8_t q0p1, const uint8x8_t p0q1,
vcombine_s16(vget_low_s16(p0q1_l), vget_low_s16(q0p1_l));
// Need to shift the second term or we end up with a2_ma2.
const int8x8_t a2_ma1 =
- InterleaveLow32(a2_a1, RightShift<32>(vneg_s8(a2_a1)));
+ InterleaveLow32(a2_a1, RightShiftVector<32>(vneg_s8(a2_a1)));
const int16x8_t p0q0_a = vaddw_s8(p0q0_l, a2_ma1);
*p1q1_result = vqmovun_s16(p1q1_a3);
@@ -251,7 +251,7 @@ inline uint8x8_t IsFlat3(const uint8x8_t abd_p0p1_q0q1,
const uint8x8_t abd_p0p2_q0q2) {
const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p0p2_q0q2);
const uint8x8_t b = vcle_u8(a, vdup_n_u8(1));
- return vand_u8(b, RightShift<32>(b));
+ return vand_u8(b, RightShiftVector<32>(b));
}
// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
@@ -264,7 +264,7 @@ inline uint8x8_t NeedsFilter6(const uint8x8_t abd_p0p1_q0q1,
const uint8_t outer_thresh) {
const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
const uint8x8_t b = vcle_u8(a, vdup_n_u8(inner_thresh));
- const uint8x8_t inner_mask = vand_u8(b, RightShift<32>(b));
+ const uint8x8_t inner_mask = vand_u8(b, RightShiftVector<32>(b));
const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
return vand_u8(inner_mask, outer_mask);
}
@@ -482,7 +482,7 @@ inline uint8x8_t IsFlat4(const uint8x8_t abd_p0n0_q0n0,
const uint8x8_t a = vmax_u8(abd_p0n0_q0n0, abd_p0n1_q0n1);
const uint8x8_t b = vmax_u8(a, abd_p0n2_q0n2);
const uint8x8_t c = vcle_u8(b, vdup_n_u8(1));
- return vand_u8(c, RightShift<32>(c));
+ return vand_u8(c, RightShiftVector<32>(c));
}
// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
@@ -498,7 +498,7 @@ inline uint8x8_t NeedsFilter8(const uint8x8_t abd_p0p1_q0q1,
const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
const uint8x8_t b = vmax_u8(a, abd_p2p3_q2q3);
const uint8x8_t c = vcle_u8(b, vdup_n_u8(inner_thresh));
- const uint8x8_t inner_mask = vand_u8(c, RightShift<32>(c));
+ const uint8x8_t inner_mask = vand_u8(c, RightShiftVector<32>(c));
const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
return vand_u8(inner_mask, outer_mask);
}
@@ -1179,7 +1179,7 @@ void LoopFilterInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc
index 337c9b4..e6ceb66 100644
--- a/src/dsp/arm/loop_restoration_neon.cc
+++ b/src/dsp/arm/loop_restoration_neon.cc
@@ -41,10 +41,25 @@ inline uint8x8_t VshrU128(const uint8x8x2_t src) {
}
template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8_t src[2]) {
+ return vext_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
+inline uint8x16_t VshrU128(const uint8x16_t src[2]) {
+ return vextq_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
inline uint16x8_t VshrU128(const uint16x8x2_t src) {
return vextq_u16(src.val[0], src.val[1], bytes / 2);
}
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8_t src[2]) {
+ return vextq_u16(src[0], src[1], bytes / 2);
+}
+
// Wiener
// Must make a local copy of coefficients to help compiler know that they have
@@ -177,18 +192,17 @@ inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
int16_t** const wiener_buffer) {
for (int y = height; y != 0; --y) {
const uint8_t* src_ptr = src;
- uint8x16_t s[4];
- s[0] = vld1q_u8(src_ptr);
+ uint8x16_t s[3];
ptrdiff_t x = width;
do {
- src_ptr += 16;
- s[3] = vld1q_u8(src_ptr);
- s[1] = vextq_u8(s[0], s[3], 1);
- s[2] = vextq_u8(s[0], s[3], 2);
+ // Slightly faster than using vextq_u8().
+ s[0] = vld1q_u8(src_ptr);
+ s[1] = vld1q_u8(src_ptr + 1);
+ s[2] = vld1q_u8(src_ptr + 2);
int16x8x2_t sum;
sum.val[0] = sum.val[1] = vdupq_n_s16(0);
WienerHorizontalSum(s, filter, sum, *wiener_buffer);
- s[0] = s[3];
+ src_ptr += 16;
*wiener_buffer += 16;
x -= 16;
} while (x != 0);
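The rewritten loop above trades the vextq_u8() shuffles of the old code for three overlapping unaligned loads; a small sketch of the equivalence (helper names are illustrative, not from the source):

    #include <arm_neon.h>

    // Both variants produce s[0] = src[0..15], s[1] = src[1..16], s[2] = src[2..17].
    void LoadShiftedNew(const uint8_t* src, uint8x16_t s[3]) {
      s[0] = vld1q_u8(src);
      s[1] = vld1q_u8(src + 1);
      s[2] = vld1q_u8(src + 2);
    }

    void LoadShiftedOld(const uint8_t* src, uint8x16_t s[3]) {
      const uint8x16_t lo = vld1q_u8(src);
      const uint8x16_t hi = vld1q_u8(src + 16);
      s[0] = lo;
      s[1] = vextq_u8(lo, hi, 1);
      s[2] = vextq_u8(lo, hi, 2);
    }
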
@@ -476,12 +490,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer,
// For width 16 and up, store the horizontal results, and then do the vertical
// filter row by row. This is faster than doing it column by column when
// considering cache issues.
-void WienerFilter_NEON(const RestorationUnitInfo& restoration_info,
- const void* const source, const void* const top_border,
- const void* const bottom_border, const ptrdiff_t stride,
- const int width, const int height,
- RestorationBuffer* const restoration_buffer,
- void* const dest) {
+void WienerFilter_NEON(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
const int16_t* const number_leading_zero_coefficients =
restoration_info.wiener_info.number_leading_zero_coefficients;
const int number_rows_to_skip = std::max(
@@ -509,39 +523,42 @@ void WienerFilter_NEON(const RestorationUnitInfo& restoration_info,
const auto* const top = static_cast<const uint8_t*>(top_border);
const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
- WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
- wiener_stride, height_extra, filter_horizontal,
- &wiener_buffer_horizontal);
- WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, height_extra,
filter_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
filter_horizontal, &wiener_buffer_horizontal);
- } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
- WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
- wiener_stride, height_extra, filter_horizontal,
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal,
&wiener_buffer_horizontal);
- WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, height_extra,
filter_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal,
+ &wiener_buffer_horizontal);
} else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
// The maximum over-reads happen here.
- WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
- wiener_stride, height_extra, filter_horizontal,
- &wiener_buffer_horizontal);
- WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
filter_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal,
+ &wiener_buffer_horizontal);
} else {
assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
- WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
- wiener_stride, height_extra,
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
&wiener_buffer_horizontal);
WienerHorizontalTap1(src, stride, wiener_stride, height,
&wiener_buffer_horizontal);
- WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
- &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
}
// vertical filtering.
@@ -574,13 +591,20 @@ void WienerFilter_NEON(const RestorationUnitInfo& restoration_info,
//------------------------------------------------------------------------------
// SGR
-inline void Prepare3_8(const uint8x8x2_t src, uint8x8_t dst[3]) {
+inline void Prepare3_8(const uint8x8_t src[2], uint8x8_t dst[3]) {
dst[0] = VshrU128<0>(src);
dst[1] = VshrU128<1>(src);
dst[2] = VshrU128<2>(src);
}
-inline void Prepare3_16(const uint16x8x2_t src, uint16x4_t low[3],
+template <int offset>
+inline void Prepare3_8(const uint8x16_t src[2], uint8x16_t dst[3]) {
+ dst[0] = VshrU128<offset + 0>(src);
+ dst[1] = VshrU128<offset + 1>(src);
+ dst[2] = VshrU128<offset + 2>(src);
+}
+
+inline void Prepare3_16(const uint16x8_t src[2], uint16x4_t low[3],
uint16x4_t high[3]) {
uint16x8_t s[3];
s[0] = VshrU128<0>(src);
@@ -594,7 +618,7 @@ inline void Prepare3_16(const uint16x8x2_t src, uint16x4_t low[3],
high[2] = vget_high_u16(s[2]);
}
-inline void Prepare5_8(const uint8x8x2_t src, uint8x8_t dst[5]) {
+inline void Prepare5_8(const uint8x8_t src[2], uint8x8_t dst[5]) {
dst[0] = VshrU128<0>(src);
dst[1] = VshrU128<1>(src);
dst[2] = VshrU128<2>(src);
@@ -602,7 +626,16 @@ inline void Prepare5_8(const uint8x8x2_t src, uint8x8_t dst[5]) {
dst[4] = VshrU128<4>(src);
}
-inline void Prepare5_16(const uint16x8x2_t src, uint16x4_t low[5],
+template <int offset>
+inline void Prepare5_8(const uint8x16_t src[2], uint8x16_t dst[5]) {
+ dst[0] = VshrU128<offset + 0>(src);
+ dst[1] = VshrU128<offset + 1>(src);
+ dst[2] = VshrU128<offset + 2>(src);
+ dst[3] = VshrU128<offset + 3>(src);
+ dst[4] = VshrU128<offset + 4>(src);
+}
+
+inline void Prepare5_16(const uint16x8_t src[2], uint16x4_t low[5],
uint16x4_t high[5]) {
Prepare3_16(src, low, high);
const uint16x8_t s3 = VshrU128<6>(src);
@@ -641,6 +674,30 @@ inline uint16x8_t Sum3W_16(const uint8x8_t src[3]) {
return vaddw_u8(sum, src[2]);
}
+inline uint16x8_t Sum3WLo16(const uint8x16_t src[3]) {
+ const uint16x8_t sum = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1]));
+ return vaddw_u8(sum, vget_low_u8(src[2]));
+}
+
+inline uint16x8_t Sum3WHi16(const uint8x16_t src[3]) {
+ const uint16x8_t sum = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1]));
+ return vaddw_u8(sum, vget_high_u8(src[2]));
+}
+
+inline uint16x8_t Sum5WLo16(const uint8x16_t src[5]) {
+ const uint16x8_t sum01 = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1]));
+ const uint16x8_t sum23 = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[3]));
+ const uint16x8_t sum = vaddq_u16(sum01, sum23);
+ return vaddw_u8(sum, vget_low_u8(src[4]));
+}
+
+inline uint16x8_t Sum5WHi16(const uint8x16_t src[5]) {
+ const uint16x8_t sum01 = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1]));
+ const uint16x8_t sum23 = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[3]));
+ const uint16x8_t sum = vaddq_u16(sum01, sum23);
+ return vaddw_u8(sum, vget_high_u8(src[4]));
+}
+
inline uint32x4_t Sum3W_32(const uint16x4_t src[3]) {
const uint32x4_t sum = vaddl_u16(src[0], src[1]);
return vaddw_u16(sum, src[2]);
@@ -678,13 +735,28 @@ inline uint32x4_t Sum5W_32(const uint16x4_t src[5]) {
return vaddw_u16(sum0123, src[4]);
}
-inline uint16x8_t Sum3Horizontal(const uint8x8x2_t src) {
+inline uint16x8_t Sum3Horizontal(const uint8x8_t src[2]) {
uint8x8_t s[3];
Prepare3_8(src, s);
return Sum3W_16(s);
}
-inline uint32x4x2_t Sum3WHorizontal(const uint16x8x2_t src) {
+inline uint16x8_t Sum3Horizontal(const uint8x16_t src) {
+ uint8x8_t s[2];
+ s[0] = vget_low_u8(src);
+ s[1] = vget_high_u8(src);
+ return Sum3Horizontal(s);
+}
+
+template <int offset>
+inline void Sum3Horizontal(const uint8x16_t src[2], uint16x8_t dst[2]) {
+ uint8x16_t s[3];
+ Prepare3_8<offset>(src, s);
+ dst[0] = Sum3WLo16(s);
+ dst[1] = Sum3WHi16(s);
+}
+
+inline uint32x4x2_t Sum3WHorizontal(const uint16x8_t src[2]) {
uint16x4_t low[3], high[3];
uint32x4x2_t sum;
Prepare3_16(src, low, high);
@@ -693,7 +765,7 @@ inline uint32x4x2_t Sum3WHorizontal(const uint16x8x2_t src) {
return sum;
}
-inline uint16x8_t Sum5Horizontal(const uint8x8x2_t src) {
+inline uint16x8_t Sum5Horizontal(const uint8x8_t src[2]) {
uint8x8_t s[5];
Prepare5_8(src, s);
const uint16x8_t sum01 = vaddl_u8(s[0], s[1]);
@@ -702,7 +774,23 @@ inline uint16x8_t Sum5Horizontal(const uint8x8x2_t src) {
return vaddw_u8(sum0123, s[4]);
}
-inline uint32x4x2_t Sum5WHorizontal(const uint16x8x2_t src) {
+inline uint16x8_t Sum5Horizontal(const uint8x16_t src) {
+ uint8x8_t s[2];
+ s[0] = vget_low_u8(src);
+ s[1] = vget_high_u8(src);
+ return Sum5Horizontal(s);
+}
+
+template <int offset>
+inline void Sum5Horizontal(const uint8x16_t src[2], uint16x8_t* const dst0,
+ uint16x8_t* const dst1) {
+ uint8x16_t s[5];
+ Prepare5_8<offset>(src, s);
+ *dst0 = Sum5WLo16(s);
+ *dst1 = Sum5WHi16(s);
+}
+
+inline uint32x4x2_t Sum5WHorizontal(const uint16x8_t src[2]) {
uint16x4_t low[5], high[5];
Prepare5_16(src, low, high);
uint32x4x2_t sum;
@@ -711,35 +799,68 @@ inline uint32x4x2_t Sum5WHorizontal(const uint16x8x2_t src) {
return sum;
}
-void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3,
- uint32x4_t* const row_sq5) {
- const uint32x4_t sum04 = vaddl_u16(src[0], src[4]);
- const uint32x4_t sum12 = vaddl_u16(src[1], src[2]);
- *row_sq3 = vaddw_u16(sum12, src[3]);
- *row_sq5 = vaddq_u32(sum04, *row_sq3);
+template <int offset>
+void SumHorizontal(const uint8x16_t src[2], uint16x8_t* const row3_0,
+ uint16x8_t* const row3_1, uint16x8_t* const row5_0,
+ uint16x8_t* const row5_1) {
+ uint8x16_t s[5];
+ Prepare5_8<offset>(src, s);
+ const uint16x8_t sum04_lo = vaddl_u8(vget_low_u8(s[0]), vget_low_u8(s[4]));
+ const uint16x8_t sum04_hi = vaddl_u8(vget_high_u8(s[0]), vget_high_u8(s[4]));
+ *row3_0 = Sum3WLo16(s + 1);
+ *row3_1 = Sum3WHi16(s + 1);
+ *row5_0 = vaddq_u16(sum04_lo, *row3_0);
+ *row5_1 = vaddq_u16(sum04_hi, *row3_1);
}
-void SumHorizontal(const uint8x8x2_t src, const uint16x8x2_t sq,
- uint16x8_t* const row3, uint16x8_t* const row5,
- uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+void SumHorizontal(const uint8x8_t src[2], uint16x8_t* const row3,
+ uint16x8_t* const row5) {
uint8x8_t s[5];
Prepare5_8(src, s);
const uint16x8_t sum04 = vaddl_u8(s[0], s[4]);
const uint16x8_t sum12 = vaddl_u8(s[1], s[2]);
*row3 = vaddw_u8(sum12, s[3]);
*row5 = vaddq_u16(sum04, *row3);
+}
+
+void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3,
+ uint32x4_t* const row_sq5) {
+ const uint32x4_t sum04 = vaddl_u16(src[0], src[4]);
+ const uint32x4_t sum12 = vaddl_u16(src[1], src[2]);
+ *row_sq3 = vaddw_u16(sum12, src[3]);
+ *row_sq5 = vaddq_u32(sum04, *row_sq3);
+}
+
+void SumHorizontal(const uint16x8_t sq[2], uint32x4x2_t* const row_sq3,
+ uint32x4x2_t* const row_sq5) {
uint16x4_t low[5], high[5];
Prepare5_16(sq, low, high);
SumHorizontal(low, &row_sq3->val[0], &row_sq5->val[0]);
SumHorizontal(high, &row_sq3->val[1], &row_sq5->val[1]);
}
-inline uint16x8_t Sum343(const uint8x8x2_t src) {
- uint8x8_t s[3];
- Prepare3_8(src, s);
- const uint16x8_t sum = Sum3W_16(s);
+void SumHorizontal(const uint8x8_t src[2], const uint16x8_t sq[2],
+ uint16x8_t* const row3, uint16x8_t* const row5,
+ uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+ SumHorizontal(src, row3, row5);
+ SumHorizontal(sq, row_sq3, row_sq5);
+}
+
+void SumHorizontal(const uint8x16_t src, const uint16x8_t sq[2],
+ uint16x8_t* const row3, uint16x8_t* const row5,
+ uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+ uint8x8_t s[2];
+ s[0] = vget_low_u8(src);
+ s[1] = vget_high_u8(src);
+ return SumHorizontal(s, sq, row3, row5, row_sq3, row_sq5);
+}
+
+template <int offset>
+inline uint16x8_t Sum343(const uint8x16_t ma3[2]) {
+ const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3);
const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
- return vaddw_u8(sum3, s[1]);
+ return vaddw_u8(sum3,
+ (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1]));
}
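Per lane, Sum343() evaluates the 3-4-3 weighted sum as 3 * (a + b + c) + b = 3a + 4b + 3c; a scalar sketch of the arithmetic it vectorizes:

    #include <cstdint>

    uint16_t Sum343Sketch(uint8_t a, uint8_t b, uint8_t c) {
      const uint16_t sum = a + b + c;  // Sum3WLo16 / Sum3WHi16
      return 3 * sum + b;              // Sum3_16(sum, sum, sum), then vaddw_u8
    }
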
inline uint32x4_t Sum343W(const uint16x4_t src[3]) {
@@ -748,7 +869,7 @@ inline uint32x4_t Sum343W(const uint16x4_t src[3]) {
return vaddw_u16(sum3, src[1]);
}
-inline uint32x4x2_t Sum343W(const uint16x8x2_t src) {
+inline uint32x4x2_t Sum343W(const uint16x8_t src[2]) {
uint16x4_t low[3], high[3];
uint32x4x2_t d;
Prepare3_16(src, low, high);
@@ -757,13 +878,13 @@ inline uint32x4x2_t Sum343W(const uint16x8x2_t src) {
return d;
}
-inline uint16x8_t Sum565(const uint8x8x2_t src) {
- uint8x8_t s[3];
- Prepare3_8(src, s);
- const uint16x8_t sum = Sum3W_16(s);
+template <int offset>
+inline uint16x8_t Sum565(const uint8x16_t ma5[2]) {
+ const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma5) : Sum3WHi16(ma5);
const uint16x8_t sum4 = vshlq_n_u16(sum, 2);
const uint16x8_t sum5 = vaddq_u16(sum4, sum);
- return vaddw_u8(sum5, s[1]);
+ return vaddw_u8(sum5,
+ (offset == 0) ? vget_low_u8(ma5[1]) : vget_high_u8(ma5[1]));
}
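Sum565() follows the same pattern with 5-6-5 weights: 5 * (a + b + c) + b = 5a + 6b + 5c, where the multiply by 5 is formed as a shift plus an add:

    #include <cstdint>

    uint16_t Sum565Sketch(uint8_t a, uint8_t b, uint8_t c) {
      const uint16_t sum = a + b + c;
      const uint16_t sum5 = (sum << 2) + sum;  // vshlq_n_u16(sum, 2) + sum
      return sum5 + b;                         // vaddw_u8
    }
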
inline uint32x4_t Sum565W(const uint16x4_t src[3]) {
@@ -773,7 +894,7 @@ inline uint32x4_t Sum565W(const uint16x4_t src[3]) {
return vaddw_u16(sum5, src[1]);
}
-inline uint32x4x2_t Sum565W(const uint16x8x2_t src) {
+inline uint32x4x2_t Sum565W(const uint16x8_t src[2]) {
uint16x4_t low[3], high[3];
uint32x4x2_t d;
Prepare3_16(src, low, high);
@@ -783,21 +904,21 @@ inline uint32x4x2_t Sum565W(const uint16x8x2_t src) {
}
inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
- const int height, const ptrdiff_t sum_stride, uint16_t* sum3,
- uint16_t* sum5, uint32_t* square_sum3,
- uint32_t* square_sum5) {
- int y = height;
+ const ptrdiff_t sum_stride, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ int y = 2;
+ // Don't change loop width to 16, which is even slower.
do {
- uint8x8x2_t s;
- uint16x8x2_t sq;
- s.val[0] = vld1_u8(src);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+ uint8x8_t s[2];
+ uint16x8_t sq[2];
+ s[0] = vld1_u8(src);
+ sq[0] = vmull_u8(s[0], s[0]);
ptrdiff_t x = 0;
do {
uint16x8_t row3, row5;
uint32x4x2_t row_sq3, row_sq5;
- s.val[1] = vld1_u8(src + x + 8);
- sq.val[1] = vmull_u8(s.val[1], s.val[1]);
+ s[1] = vld1_u8(src + x + 8);
+ sq[1] = vmull_u8(s[1], s[1]);
SumHorizontal(s, sq, &row3, &row5, &row_sq3, &row_sq5);
vst1q_u16(sum3, row3);
vst1q_u16(sum5, row5);
@@ -805,8 +926,8 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
vst1q_u32(square_sum3 + 4, row_sq3.val[1]);
vst1q_u32(square_sum5 + 0, row_sq5.val[0]);
vst1q_u32(square_sum5 + 4, row_sq5.val[1]);
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
+ s[0] = s[1];
+ sq[0] = sq[1];
sum3 += 8;
sum5 += 8;
square_sum3 += 8;
@@ -819,21 +940,22 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
template <int size>
inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
- const int height, const ptrdiff_t sum_stride, uint16_t* sums,
+ const ptrdiff_t sum_stride, uint16_t* sums,
uint32_t* square_sums) {
static_assert(size == 3 || size == 5, "");
- int y = height;
+ int y = 2;
+ // Don't change loop width to 16, which is even slower.
do {
- uint8x8x2_t s;
- uint16x8x2_t sq;
- s.val[0] = vld1_u8(src);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+ uint8x8_t s[2];
+ uint16x8_t sq[2];
+ s[0] = vld1_u8(src);
+ sq[0] = vmull_u8(s[0], s[0]);
ptrdiff_t x = 0;
do {
uint16x8_t row;
uint32x4x2_t row_sq;
- s.val[1] = vld1_u8(src + x + 8);
- sq.val[1] = vmull_u8(s.val[1], s.val[1]);
+ s[1] = vld1_u8(src + x + 8);
+ sq[1] = vmull_u8(s[1], s[1]);
if (size == 3) {
row = Sum3Horizontal(s);
row_sq = Sum3WHorizontal(sq);
@@ -844,8 +966,8 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
vst1q_u16(sums, row);
vst1q_u32(square_sums + 0, row_sq.val[0]);
vst1q_u32(square_sums + 4, row_sq.val[1]);
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
+ s[0] = s[1];
+ sq[0] = sq[1];
sums += 8;
square_sums += 8;
x += 8;
@@ -871,10 +993,18 @@ inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq,
return vmovn_u32(shifted);
}
-template <int n>
+inline uint8x8_t AdjustValue(const uint8x8_t value, const uint8x8_t index,
+ const int threshold) {
+ const uint8x8_t thresholds = vdup_n_u8(threshold);
+ const uint8x8_t offset = vcgt_u8(index, thresholds);
+ // Adding 255 is equivalent to subtracting 1 for 8-bit data.
+ return vadd_u8(value, offset);
+}
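AdjustValue() relies on 8-bit wraparound: the vcgt_u8 mask is 0 or 255 per lane, and adding 255 modulo 256 equals subtracting 1, so the add conditionally decrements the value. A scalar sketch:

    #include <cstdint>

    uint8_t AdjustValueSketch(uint8_t value, uint8_t index, uint8_t threshold) {
      const uint8_t mask = (index > threshold) ? 0xFF : 0x00;  // vcgt_u8
      return static_cast<uint8_t>(value + mask);               // vadd_u8 wraps mod 256
    }
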
+
+template <int n, int offset>
inline void CalculateIntermediate(const uint16x8_t sum,
const uint32x4x2_t sum_sq,
- const uint32_t scale, uint8x8_t* const ma,
+ const uint32_t scale, uint8x16_t* const ma,
uint16x8_t* const b) {
constexpr uint32_t one_over_n =
((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
@@ -882,19 +1012,39 @@ inline void CalculateIntermediate(const uint16x8_t sum,
const uint16x4_t z1 =
CalculateMa<n>(vget_high_u16(sum), sum_sq.val[1], scale);
const uint16x8_t z01 = vcombine_u16(z0, z1);
- // Using vqmovn_u16() needs an extra sign extension instruction.
- const uint16x8_t z = vminq_u16(z01, vdupq_n_u16(255));
- // Using vgetq_lane_s16() can save the sign extension instruction.
- const uint8_t lookup[8] = {
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 0)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 1)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 2)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 3)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 4)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 5)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 6)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 7)]};
- *ma = vld1_u8(lookup);
+ const uint8x8_t idx = vqmovn_u16(z01);
+ // Use table lookup to read elements whose indices are less than 48.
+ // Using one uint8x8x4_t vector and one uint8x8x2_t vector is faster than
+ // using two uint8x8x3_t vectors.
+ uint8x8x4_t table0;
+ uint8x8x2_t table1;
+ table0.val[0] = vld1_u8(kSgrMaLookup + 0 * 8);
+ table0.val[1] = vld1_u8(kSgrMaLookup + 1 * 8);
+ table0.val[2] = vld1_u8(kSgrMaLookup + 2 * 8);
+ table0.val[3] = vld1_u8(kSgrMaLookup + 3 * 8);
+ table1.val[0] = vld1_u8(kSgrMaLookup + 4 * 8);
+ table1.val[1] = vld1_u8(kSgrMaLookup + 5 * 8);
+ // All elements whose indices are out of range [0, 47] are set to 0.
+ uint8x8_t val = vtbl4_u8(table0, idx); // Range [0, 31].
+ // Subtract 32 to look up the next index range.
+ const uint8x8_t index = vsub_u8(idx, vdup_n_u8(32));
+ const uint8x8_t res = vtbl2_u8(table1, index); // Range [32, 47].
+ // Use an OR instruction to combine the two lookup results.
+ val = vorr_u8(val, res);
+
+ // For elements whose indices are larger than 47, the values change only
+ // rarely as the index increases, so comparison and arithmetic operations
+ // are used to calculate their values.
+ // Elements whose indices are larger than 47 (looked up as 0 above) are set
+ // to 5.
+ val = vmax_u8(val, vdup_n_u8(5));
+ val = AdjustValue(val, idx, 55); // 55 is the last index whose value is 5.
+ val = AdjustValue(val, idx, 72); // 72 is the last index whose value is 4.
+ val = AdjustValue(val, idx, 101); // 101 is the last index whose value is 3.
+ val = AdjustValue(val, idx, 169); // 169 is the last index whose value is 2.
+ val = AdjustValue(val, idx, 254); // 254 is the last index whose value is 1.
+ *ma = (offset == 0) ? vcombine_u8(val, vget_high_u8(*ma))
+ : vcombine_u8(vget_low_u8(*ma), val);
+
// b = ma * b * one_over_n
// |ma| = [0, 255]
// |sum| is a box sum with radius 1 or 2.
@@ -906,7 +1056,8 @@ inline void CalculateIntermediate(const uint16x8_t sum,
// |kSgrProjReciprocalBits| is 12.
// Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
// Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
- const uint16x8_t maq = vmovl_u8(*ma);
+ const uint16x8_t maq =
+ vmovl_u8((offset == 0) ? vget_low_u8(*ma) : vget_high_u8(*ma));
const uint32x4_t m0 = vmull_u16(vget_low_u16(maq), vget_low_u16(sum));
const uint32x4_t m1 = vmull_u16(vget_high_u16(maq), vget_high_u16(sum));
const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n);
@@ -916,37 +1067,39 @@ inline void CalculateIntermediate(const uint16x8_t sum,
*b = vcombine_u16(b_lo, b_hi);
}
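Taken together, the vmax_u8() and the five AdjustValue() steps above reconstruct the kSgrMaLookup entries for indices above 47 without a larger table: the value starts at 5 and drops by one past each listed threshold. A scalar sketch, assuming the thresholds in the comments are exhaustive:

    #include <cstdint>

    uint8_t SgrMaAbove47Sketch(uint8_t idx) {
      // Only meaningful for idx > 47, where the table lookup returned 0.
      uint8_t val = 5;  // vmax_u8(val, vdup_n_u8(5))
      const uint8_t thresholds[] = {55, 72, 101, 169, 254};
      for (const uint8_t t : thresholds) {
        if (idx > t) --val;  // AdjustValue(val, idx, t)
      }
      return val;
    }
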
+template <int offset>
inline void CalculateIntermediate5(const uint16x8_t s5[5],
const uint32x4x2_t sq5[5],
- const uint32_t scale, uint8x8_t* const ma,
+ const uint32_t scale, uint8x16_t* const ma,
uint16x8_t* const b) {
const uint16x8_t sum = Sum5_16(s5);
const uint32x4x2_t sum_sq = Sum5_32(sq5);
- CalculateIntermediate<25>(sum, sum_sq, scale, ma, b);
+ CalculateIntermediate<25, offset>(sum, sum_sq, scale, ma, b);
}
+template <int offset>
inline void CalculateIntermediate3(const uint16x8_t s3[3],
const uint32x4x2_t sq3[3],
- const uint32_t scale, uint8x8_t* const ma,
+ const uint32_t scale, uint8x16_t* const ma,
uint16x8_t* const b) {
const uint16x8_t sum = Sum3_16(s3);
const uint32x4x2_t sum_sq = Sum3_32(sq3);
- CalculateIntermediate<9>(sum, sum_sq, scale, ma, b);
+ CalculateIntermediate<9, offset>(sum, sum_sq, scale, ma, b);
}
-inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
const ptrdiff_t x, uint16x8_t* const sum_ma343,
uint16x8_t* const sum_ma444,
uint32x4x2_t* const sum_b343,
uint32x4x2_t* const sum_b444, uint16_t* const ma343,
uint16_t* const ma444, uint32_t* const b343,
uint32_t* const b444) {
- uint8x8_t s[3];
- Prepare3_8(ma3, s);
- const uint16x8_t sum_ma111 = Sum3W_16(s);
+ const uint16x8_t sum_ma111 = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3);
*sum_ma444 = vshlq_n_u16(sum_ma111, 2);
const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111);
- *sum_ma343 = vaddw_u8(sum333, s[1]);
+ *sum_ma343 = vaddw_u8(
+ sum333, (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1]));
uint16x4_t low[3], high[3];
uint32x4x2_t sum_b111;
Prepare3_16(b3, low, high);
@@ -966,93 +1119,211 @@ inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
vst1q_u32(b444 + x + 4, sum_b444->val[1]);
}
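Store343_444() derives both weighted sums from a single 1-1-1 sum: the 4-4-4 sum is 4 * (a + b + c), and the 3-4-3 sum is that minus (a + b + c) plus the middle element. A scalar sketch of the per-lane arithmetic:

    #include <cstdint>

    void Store343_444Sketch(uint8_t a, uint8_t b, uint8_t c, uint16_t* sum343,
                            uint16_t* sum444) {
      const uint16_t sum111 = a + b + c;
      *sum444 = sum111 << 2;                     // vshlq_n_u16(sum_ma111, 2)
      const uint16_t sum333 = *sum444 - sum111;
      *sum343 = sum333 + b;                      // vaddw_u8(sum333, middle)
    }
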
-inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
const ptrdiff_t x, uint16x8_t* const sum_ma343,
uint32x4x2_t* const sum_b343, uint16_t* const ma343,
uint16_t* const ma444, uint32_t* const b343,
uint32_t* const b444) {
uint16x8_t sum_ma444;
uint32x4x2_t sum_b444;
- Store343_444(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444, ma343,
- ma444, b343, b444);
+ Store343_444<offset>(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444,
+ ma343, ma444, b343, b444);
}
-inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
const ptrdiff_t x, uint16_t* const ma343,
uint16_t* const ma444, uint32_t* const b343,
uint32_t* const b444) {
uint16x8_t sum_ma343;
uint32x4x2_t sum_b343;
- Store343_444(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343, b444);
+ Store343_444<offset>(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343,
+ b444);
}
-LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
- const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
- const uint32_t scale, uint16_t* const sum5[5],
- uint32_t* const square_sum5[5], uint8x8x2_t s[2], uint16x8x2_t sq[2],
- uint8x8_t* const ma, uint16x8_t* const b) {
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const uint8_t* const src0, const uint8_t* const src1, const uint32_t scale,
+ uint8x16_t s[2][2], uint16_t* const sum5[5], uint32_t* const square_sum5[5],
+ uint16x8_t sq[2][4], uint8x16_t* const ma, uint16x8_t* const b) {
uint16x8_t s5[5];
uint32x4x2_t sq5[5];
- s[0].val[1] = vld1_u8(src0 + x + 8);
- s[1].val[1] = vld1_u8(src1 + x + 8);
- sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]);
- sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]);
- s5[3] = Sum5Horizontal(s[0]);
- s5[4] = Sum5Horizontal(s[1]);
+ s[0][0] = vld1q_u8(src0);
+ s[1][0] = vld1q_u8(src1);
+ sq[0][0] = vmull_u8(vget_low_u8(s[0][0]), vget_low_u8(s[0][0]));
+ sq[1][0] = vmull_u8(vget_low_u8(s[1][0]), vget_low_u8(s[1][0]));
+ sq[0][1] = vmull_u8(vget_high_u8(s[0][0]), vget_high_u8(s[0][0]));
+ sq[1][1] = vmull_u8(vget_high_u8(s[1][0]), vget_high_u8(s[1][0]));
+ s5[3] = Sum5Horizontal(s[0][0]);
+ s5[4] = Sum5Horizontal(s[1][0]);
sq5[3] = Sum5WHorizontal(sq[0]);
sq5[4] = Sum5WHorizontal(sq[1]);
- vst1q_u16(sum5[3] + x, s5[3]);
- vst1q_u16(sum5[4] + x, s5[4]);
+ vst1q_u16(sum5[3], s5[3]);
+ vst1q_u16(sum5[4], s5[4]);
+ vst1q_u32(square_sum5[3] + 0, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + 4, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + 0, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + 4, sq5[4].val[1]);
+ s5[0] = vld1q_u16(sum5[0]);
+ s5[1] = vld1q_u16(sum5[1]);
+ s5[2] = vld1q_u16(sum5[2]);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
+ const uint32_t scale, uint8x16_t s[2][2], uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma[2],
+ uint16x8_t b[2]) {
+ uint16x8_t s5[2][5];
+ uint32x4x2_t sq5[5];
+ s[0][1] = vld1q_u8(src0 + x + 8);
+ s[1][1] = vld1q_u8(src1 + x + 8);
+ sq[0][2] = vmull_u8(vget_low_u8(s[0][1]), vget_low_u8(s[0][1]));
+ sq[1][2] = vmull_u8(vget_low_u8(s[1][1]), vget_low_u8(s[1][1]));
+ Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
+ Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
+ sq5[3] = Sum5WHorizontal(sq[0] + 1);
+ sq5[4] = Sum5WHorizontal(sq[1] + 1);
+ vst1q_u16(sum5[3] + x, s5[0][3]);
+ vst1q_u16(sum5[4] + x, s5[0][4]);
vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
- s5[0] = vld1q_u16(sum5[0] + x);
- s5[1] = vld1q_u16(sum5[1] + x);
- s5[2] = vld1q_u16(sum5[2] + x);
+ s5[0][0] = vld1q_u16(sum5[0] + x);
+ s5[0][1] = vld1q_u16(sum5[1] + x);
+ s5[0][2] = vld1q_u16(sum5[2] + x);
sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
- CalculateIntermediate5(s5, sq5, scale, ma, b);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]);
+
+ sq[0][3] = vmull_u8(vget_high_u8(s[0][1]), vget_high_u8(s[0][1]));
+ sq[1][3] = vmull_u8(vget_high_u8(s[1][1]), vget_high_u8(s[1][1]));
+ sq5[3] = Sum5WHorizontal(sq[0] + 2);
+ sq5[4] = Sum5WHorizontal(sq[1] + 2);
+ vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+ vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+ vst1q_u32(square_sum5[3] + x + 8, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + x + 12, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + x + 8, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + x + 12, sq5[4].val[1]);
+ s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+ s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+ s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const uint8_t* const src, const uint32_t scale, uint8x16_t* const s,
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ uint16x8_t sq[2], uint8x16_t* const ma, uint16x8_t* const b) {
+ uint16x8_t s5[5];
+ uint32x4x2_t sq5[5];
+ *s = vld1q_u8(src);
+ sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s));
+ sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s));
+ s5[3] = s5[4] = Sum5Horizontal(*s);
+ sq5[3] = sq5[4] = Sum5WHorizontal(sq);
+ s5[0] = vld1q_u16(sum5[0]);
+ s5[1] = vld1q_u16(sum5[1]);
+ s5[2] = vld1q_u16(sum5[2]);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
- const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
- uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma,
- uint16x8_t* const b) {
- uint16x8_t s5[5];
+ uint8x16_t s[2], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], uint16x8_t sq[3], uint8x16_t ma[2],
+ uint16x8_t b[2]) {
+ uint16x8_t s5[2][5];
uint32x4x2_t sq5[5];
- s->val[1] = vld1_u8(src + x + 8);
- sq->val[1] = vmull_u8(s->val[1], s->val[1]);
- s5[3] = s5[4] = Sum5Horizontal(*s);
- sq5[3] = sq5[4] = Sum5WHorizontal(*sq);
- s5[0] = vld1q_u16(sum5[0] + x);
- s5[1] = vld1q_u16(sum5[1] + x);
- s5[2] = vld1q_u16(sum5[2] + x);
+ s[1] = vld1q_u8(src + x + 8);
+ sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+ Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]);
+ sq5[3] = sq5[4] = Sum5WHorizontal(sq);
+ s5[0][0] = vld1q_u16(sum5[0] + x);
+ s5[0][1] = vld1q_u16(sum5[1] + x);
+ s5[0][2] = vld1q_u16(sum5[2] + x);
+ s5[0][4] = s5[0][3];
sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
- CalculateIntermediate5(s5, sq5, scale, ma, b);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]);
+
+ sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+ sq5[3] = sq5[4] = Sum5WHorizontal(sq + 1);
+ s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+ s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+ s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+ s5[1][4] = s5[1][3];
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const uint8_t* const src, const uint32_t scale, uint8x16_t* const s,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16x8_t sq[2],
+ uint8x16_t* const ma, uint16x8_t* const b) {
+ uint16x8_t s3[3];
+ uint32x4x2_t sq3[3];
+ *s = vld1q_u8(src);
+ sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s));
+ sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s));
+ s3[2] = Sum3Horizontal(*s);
+ sq3[2] = Sum3WHorizontal(sq);
+ vst1q_u16(sum3[2], s3[2]);
+ vst1q_u32(square_sum3[2] + 0, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + 4, sq3[2].val[1]);
+ s3[0] = vld1q_u16(sum3[0]);
+ s3[1] = vld1q_u16(sum3[1]);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4);
+ CalculateIntermediate3<0>(s3, sq3, scale, ma, b);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
- uint16_t* const sum3[3], uint32_t* const square_sum3[3],
- uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma,
- uint16x8_t* const b) {
- uint16x8_t s3[3];
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint8x16_t s[2],
+ uint16x8_t sq[3], uint8x16_t ma[2], uint16x8_t b[2]) {
+ uint16x8_t s3[4];
uint32x4x2_t sq3[3];
- s->val[1] = vld1_u8(src + x + 8);
- sq->val[1] = vmull_u8(s->val[1], s->val[1]);
- s3[2] = Sum3Horizontal(*s);
- sq3[2] = Sum3WHorizontal(*sq);
+ s[1] = vld1q_u8(src + x + 8);
+ sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+ Sum3Horizontal<8>(s, s3 + 2);
+ sq3[2] = Sum3WHorizontal(sq);
vst1q_u16(sum3[2] + x, s3[2]);
vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
@@ -1062,71 +1333,204 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
- CalculateIntermediate3(s3, sq3, scale, ma, b);
+ CalculateIntermediate3<8>(s3, sq3, scale, &ma[0], &b[0]);
+
+ sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+ sq3[2] = Sum3WHorizontal(sq + 1);
+ vst1q_u16(sum3[2] + x + 8, s3[3]);
+ vst1q_u32(square_sum3[2] + x + 8, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + x + 12, sq3[2].val[1]);
+ s3[1] = vld1q_u16(sum3[0] + x + 8);
+ s3[2] = vld1q_u16(sum3[1] + x + 8);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12);
+ CalculateIntermediate3<0>(s3 + 1, sq3, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const uint8_t* const src0, const uint8_t* const src1,
+ const uint16_t scales[2], uint8x16_t s[2][2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2],
+ uint16x8_t b3[2][3], uint8x16_t* const ma5, uint16x8_t* const b5) {
+ uint16x8_t s3[4], s5[5];
+ uint32x4x2_t sq3[4], sq5[5];
+ s[0][0] = vld1q_u8(src0);
+ s[1][0] = vld1q_u8(src1);
+ sq[0][0] = vmull_u8(vget_low_u8(s[0][0]), vget_low_u8(s[0][0]));
+ sq[1][0] = vmull_u8(vget_low_u8(s[1][0]), vget_low_u8(s[1][0]));
+ sq[0][1] = vmull_u8(vget_high_u8(s[0][0]), vget_high_u8(s[0][0]));
+ sq[1][1] = vmull_u8(vget_high_u8(s[1][0]), vget_high_u8(s[1][0]));
+ SumHorizontal(s[0][0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]);
+ SumHorizontal(s[1][0], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]);
+ vst1q_u16(sum3[2], s3[2]);
+ vst1q_u16(sum3[3], s3[3]);
+ vst1q_u32(square_sum3[2] + 0, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + 4, sq3[2].val[1]);
+ vst1q_u32(square_sum3[3] + 0, sq3[3].val[0]);
+ vst1q_u32(square_sum3[3] + 4, sq3[3].val[1]);
+ vst1q_u16(sum5[3], s5[3]);
+ vst1q_u16(sum5[4], s5[4]);
+ vst1q_u32(square_sum5[3] + 0, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + 4, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + 0, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + 4, sq5[4].val[1]);
+ s3[0] = vld1q_u16(sum3[0]);
+ s3[1] = vld1q_u16(sum3[1]);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4);
+ s5[0] = vld1q_u16(sum5[0]);
+ s5[1] = vld1q_u16(sum5[1]);
+ s5[2] = vld1q_u16(sum5[2]);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ CalculateIntermediate3<0>(s3, sq3, scales[1], ma3[0], b3[0]);
+ CalculateIntermediate3<0>(s3 + 1, sq3 + 1, scales[1], ma3[1], b3[1]);
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
- const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
- uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
- uint8x8x2_t s[2], uint16x8x2_t sq[2], uint8x8_t* const ma3_0,
- uint8x8_t* const ma3_1, uint16x8_t* const b3_0, uint16x8_t* const b3_1,
- uint8x8_t* const ma5, uint16x8_t* const b5) {
- uint16x8_t s3[4], s5[5];
+ const uint16_t scales[2], uint8x16_t s[2][2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2],
+ uint16x8_t b3[2][3], uint8x16_t ma5[2], uint16x8_t b5[2]) {
+ uint16x8_t s3[2][4], s5[2][5];
uint32x4x2_t sq3[4], sq5[5];
- s[0].val[1] = vld1_u8(src0 + x + 8);
- s[1].val[1] = vld1_u8(src1 + x + 8);
- sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]);
- sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]);
- SumHorizontal(s[0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]);
- SumHorizontal(s[1], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]);
- vst1q_u16(sum3[2] + x, s3[2]);
- vst1q_u16(sum3[3] + x, s3[3]);
+ s[0][1] = vld1q_u8(src0 + x + 8);
+ s[1][1] = vld1q_u8(src1 + x + 8);
+ sq[0][2] = vmull_u8(vget_low_u8(s[0][1]), vget_low_u8(s[0][1]));
+ sq[1][2] = vmull_u8(vget_low_u8(s[1][1]), vget_low_u8(s[1][1]));
+ SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+ SumHorizontal(sq[0] + 1, &sq3[2], &sq5[3]);
+ SumHorizontal(sq[1] + 1, &sq3[3], &sq5[4]);
+ vst1q_u16(sum3[2] + x, s3[0][2]);
+ vst1q_u16(sum3[3] + x, s3[0][3]);
vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
vst1q_u32(square_sum3[3] + x + 0, sq3[3].val[0]);
vst1q_u32(square_sum3[3] + x + 4, sq3[3].val[1]);
- vst1q_u16(sum5[3] + x, s5[3]);
- vst1q_u16(sum5[4] + x, s5[4]);
+ vst1q_u16(sum5[3] + x, s5[0][3]);
+ vst1q_u16(sum5[4] + x, s5[0][4]);
vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
- s3[0] = vld1q_u16(sum3[0] + x);
- s3[1] = vld1q_u16(sum3[1] + x);
+ s3[0][0] = vld1q_u16(sum3[0] + x);
+ s3[0][1] = vld1q_u16(sum3[1] + x);
sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
- s5[0] = vld1q_u16(sum5[0] + x);
- s5[1] = vld1q_u16(sum5[1] + x);
- s5[2] = vld1q_u16(sum5[2] + x);
+ s5[0][0] = vld1q_u16(sum5[0] + x);
+ s5[0][1] = vld1q_u16(sum5[1] + x);
+ s5[0][2] = vld1q_u16(sum5[2] + x);
sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
- CalculateIntermediate3(s3, sq3, scales[1], ma3_0, b3_0);
- CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], ma3_1, b3_1);
- CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+ CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0][0], &b3[0][1]);
+ CalculateIntermediate3<8>(s3[0] + 1, sq3 + 1, scales[1], &ma3[1][0],
+ &b3[1][1]);
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]);
+
+ sq[0][3] = vmull_u8(vget_high_u8(s[0][1]), vget_high_u8(s[0][1]));
+ sq[1][3] = vmull_u8(vget_high_u8(s[1][1]), vget_high_u8(s[1][1]));
+ SumHorizontal(sq[0] + 2, &sq3[2], &sq5[3]);
+ SumHorizontal(sq[1] + 2, &sq3[3], &sq5[4]);
+ vst1q_u16(sum3[2] + x + 8, s3[1][2]);
+ vst1q_u16(sum3[3] + x + 8, s3[1][3]);
+ vst1q_u32(square_sum3[2] + x + 8, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + x + 12, sq3[2].val[1]);
+ vst1q_u32(square_sum3[3] + x + 8, sq3[3].val[0]);
+ vst1q_u32(square_sum3[3] + x + 12, sq3[3].val[1]);
+ vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+ vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+ vst1q_u32(square_sum5[3] + x + 8, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + x + 12, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + x + 8, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + x + 12, sq5[4].val[1]);
+ s3[1][0] = vld1q_u16(sum3[0] + x + 8);
+ s3[1][1] = vld1q_u16(sum3[1] + x + 8);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12);
+ s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+ s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+ s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[0][1], &b3[0][2]);
+ CalculateIntermediate3<0>(s3[1] + 1, sq3 + 1, scales[1], &ma3[1][1],
+ &b3[1][2]);
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const uint8_t* const src, const uint16_t scales[2],
+ const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+ uint8x16_t* const s, uint16x8_t sq[2], uint8x16_t* const ma3,
+ uint8x16_t* const ma5, uint16x8_t* const b3, uint16x8_t* const b5) {
+ uint16x8_t s3[3], s5[5];
+ uint32x4x2_t sq3[3], sq5[5];
+ *s = vld1q_u8(src);
+ sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s));
+ sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s));
+ SumHorizontal(*s, sq, &s3[2], &s5[3], &sq3[2], &sq5[3]);
+ s5[0] = vld1q_u16(sum5[0]);
+ s5[1] = vld1q_u16(sum5[1]);
+ s5[2] = vld1q_u16(sum5[2]);
+ s5[4] = s5[3];
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ sq5[4] = sq5[3];
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+ s3[0] = vld1q_u16(sum3[0]);
+ s3[1] = vld1q_u16(sum3[1]);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4);
+ CalculateIntermediate3<0>(s3, sq3, scales[1], ma3, b3);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
const uint8_t* const src, const ptrdiff_t x, const uint16_t scales[2],
const uint16_t* const sum3[4], const uint16_t* const sum5[5],
const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
- uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma3,
- uint8x8_t* const ma5, uint16x8_t* const b3, uint16x8_t* const b5) {
- uint16x8_t s3[3], s5[5];
+ uint8x16_t s[2], uint16x8_t sq[3], uint8x16_t ma3[2], uint8x16_t ma5[2],
+ uint16x8_t b3[2], uint16x8_t b5[2]) {
+ uint16x8_t s3[2][3], s5[2][5];
uint32x4x2_t sq3[3], sq5[5];
- s->val[1] = vld1_u8(src + x + 8);
- sq->val[1] = vmull_u8(s->val[1], s->val[1]);
- SumHorizontal(*s, *sq, &s3[2], &s5[3], &sq3[2], &sq5[3]);
- s5[0] = vld1q_u16(sum5[0] + x);
- s5[1] = vld1q_u16(sum5[1] + x);
- s5[2] = vld1q_u16(sum5[2] + x);
- s5[4] = s5[3];
+ s[1] = vld1q_u8(src + x + 8);
+ sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+ SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal(sq, &sq3[2], &sq5[3]);
+ s5[0][0] = vld1q_u16(sum5[0] + x);
+ s5[0][1] = vld1q_u16(sum5[1] + x);
+ s5[0][2] = vld1q_u16(sum5[2] + x);
+ s5[0][4] = s5[0][3];
sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
@@ -1134,14 +1538,36 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
sq5[4] = sq5[3];
- CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
- s3[0] = vld1q_u16(sum3[0] + x);
- s3[1] = vld1q_u16(sum3[1] + x);
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]);
+ s3[0][0] = vld1q_u16(sum3[0] + x);
+ s3[0][1] = vld1q_u16(sum3[1] + x);
sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
- CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+ CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0], &b3[0]);
+
+ sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+ SumHorizontal(sq + 1, &sq3[2], &sq5[3]);
+ s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+ s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+ s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+ s5[1][4] = s5[1][3];
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ sq5[4] = sq5[3];
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]);
+ s3[1][0] = vld1q_u16(sum3[0] + x + 8);
+ s3[1][1] = vld1q_u16(sum3[1] + x + 8);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12);
+ CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[1], &b3[1]);
}
inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
@@ -1150,33 +1576,39 @@ inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
uint16_t* const sum5[5],
uint32_t* const square_sum5[5],
uint16_t* ma565, uint32_t* b565) {
- uint8x8x2_t s[2], mas;
- uint16x8x2_t sq[2], bs;
- s[0].val[0] = vld1_u8(src0);
- s[1].val[0] = vld1_u8(src1);
- sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
- sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
- BoxFilterPreProcess5(src0, src1, 0, scale, sum5, square_sum5, s, sq,
- &mas.val[0], &bs.val[0]);
+ uint8x16_t s[2][2], mas[2];
+ uint16x8_t sq[2][4], bs[3];
+ BoxFilterPreProcess5Lo(src0, src1, scale, s, sum5, square_sum5, sq, &mas[0],
+ &bs[0]);
int x = 0;
do {
- s[0].val[0] = s[0].val[1];
- s[1].val[0] = s[1].val[1];
- sq[0].val[0] = sq[0].val[1];
- sq[1].val[0] = sq[1].val[1];
- BoxFilterPreProcess5(src0, src1, x + 8, scale, sum5, square_sum5, s, sq,
- &mas.val[1], &bs.val[1]);
- const uint16x8_t ma = Sum565(mas);
- const uint32x4x2_t b = Sum565W(bs);
- vst1q_u16(ma565, ma);
- vst1q_u32(b565 + 0, b.val[0]);
- vst1q_u32(b565 + 4, b.val[1]);
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
- ma565 += 8;
- b565 += 8;
- x += 8;
+ uint16x8_t ma[2];
+ uint8x16_t masx[3];
+ uint32x4x2_t b[2];
+ BoxFilterPreProcess5(src0, src1, x + 8, scale, s, sum5, square_sum5, sq,
+ mas, bs + 1);
+ Prepare3_8<0>(mas, masx);
+ ma[0] = Sum565<0>(masx);
+ b[0] = Sum565W(bs);
+ vst1q_u16(ma565, ma[0]);
+ vst1q_u32(b565 + 0, b[0].val[0]);
+ vst1q_u32(b565 + 4, b[0].val[1]);
+
+ ma[1] = Sum565<8>(masx);
+ b[1] = Sum565W(bs + 1);
+ vst1q_u16(ma565 + 8, ma[1]);
+ vst1q_u32(b565 + 8, b[1].val[0]);
+ vst1q_u32(b565 + 12, b[1].val[1]);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
} while (x < width);
}
@@ -1185,35 +1617,44 @@ LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
const uint8_t* const src, const int width, const uint32_t scale,
uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16_t* ma343,
uint16_t* ma444, uint32_t* b343, uint32_t* b444) {
- uint8x8x2_t s, mas;
- uint16x8x2_t sq, bs;
- s.val[0] = vld1_u8(src);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
- BoxFilterPreProcess3(src, 0, scale, sum3, square_sum3, &s, &sq, &mas.val[0],
- &bs.val[0]);
+ uint8x16_t s[2], mas[2];
+ uint16x8_t sq[4], bs[3];
+ BoxFilterPreProcess3Lo(src, scale, &s[0], sum3, square_sum3, sq, &mas[0],
+ &bs[0]);
int x = 0;
do {
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
- BoxFilterPreProcess3(src, x + 8, scale, sum3, square_sum3, &s, &sq,
- &mas.val[1], &bs.val[1]);
+ uint8x16_t ma3x[3];
+ BoxFilterPreProcess3(src, x + 8, scale, sum3, square_sum3, s, sq + 1, mas,
+ bs + 1);
+ Prepare3_8<0>(mas, ma3x);
if (calculate444) {
- Store343_444(mas, bs, 0, ma343, ma444, b343, b444);
- ma444 += 8;
- b444 += 8;
+ Store343_444<0>(ma3x, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444<8>(ma3x, bs + 1, 0, ma343 + 8, ma444 + 8, b343 + 8,
+ b444 + 8);
+ ma444 += 16;
+ b444 += 16;
} else {
- const uint16x8_t ma = Sum343(mas);
- const uint32x4x2_t b = Sum343W(bs);
- vst1q_u16(ma343, ma);
- vst1q_u32(b343 + 0, b.val[0]);
- vst1q_u32(b343 + 4, b.val[1]);
+ uint16x8_t ma[2];
+ uint32x4x2_t b[2];
+ ma[0] = Sum343<0>(ma3x);
+ b[0] = Sum343W(bs);
+ vst1q_u16(ma343, ma[0]);
+ vst1q_u32(b343 + 0, b[0].val[0]);
+ vst1q_u32(b343 + 4, b[0].val[1]);
+ ma[1] = Sum343<8>(ma3x);
+ b[1] = Sum343W(bs + 1);
+ vst1q_u16(ma343 + 8, ma[1]);
+ vst1q_u32(b343 + 8, b[1].val[0]);
+ vst1q_u32(b343 + 12, b[1].val[1]);
}
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
- ma343 += 8;
- b343 += 8;
- x += 8;
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma343 += 16;
+ b343 += 16;
+ x += 16;
} while (x < width);
}
@@ -1221,48 +1662,58 @@ inline void BoxSumFilterPreProcess(
const uint8_t* const src0, const uint8_t* const src1, const int width,
const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
- uint16_t* const ma343[4], uint16_t* const ma444[2], uint16_t* ma565,
- uint32_t* const b343[4], uint32_t* const b444[2], uint32_t* b565) {
- uint8x8x2_t s[2];
- uint8x8x2_t ma3[2], ma5;
- uint16x8x2_t sq[2], b3[2], b5;
- s[0].val[0] = vld1_u8(src0);
- s[1].val[0] = vld1_u8(src1);
- sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
- sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
- BoxFilterPreProcess(src0, src1, 0, scales, sum3, sum5, square_sum3,
- square_sum5, s, sq, &ma3[0].val[0], &ma3[1].val[0],
- &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]);
+ uint16_t* const ma343[4], uint16_t* const ma444, uint16_t* ma565,
+ uint32_t* const b343[4], uint32_t* const b444, uint32_t* b565) {
+ uint8x16_t s[2][2], ma3[2][2], ma5[2];
+ uint16x8_t sq[2][4], b3[2][3], b5[3];
+ BoxFilterPreProcessLo(src0, src1, scales, s, sum3, sum5, square_sum3,
+ square_sum5, sq, ma3, b3, &ma5[0], &b5[0]);
int x = 0;
do {
- s[0].val[0] = s[0].val[1];
- s[1].val[0] = s[1].val[1];
- sq[0].val[0] = sq[0].val[1];
- sq[1].val[0] = sq[1].val[1];
- BoxFilterPreProcess(src0, src1, x + 8, scales, sum3, sum5, square_sum3,
- square_sum5, s, sq, &ma3[0].val[1], &ma3[1].val[1],
- &b3[0].val[1], &b3[1].val[1], &ma5.val[1], &b5.val[1]);
- uint16x8_t ma = Sum343(ma3[0]);
- uint32x4x2_t b = Sum343W(b3[0]);
- vst1q_u16(ma343[0] + x, ma);
- vst1q_u32(b343[0] + x, b.val[0]);
- vst1q_u32(b343[0] + x + 4, b.val[1]);
- Store343_444(ma3[1], b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
- ma = Sum565(ma5);
- b = Sum565W(b5);
- vst1q_u16(ma565, ma);
- vst1q_u32(b565 + 0, b.val[0]);
- vst1q_u32(b565 + 4, b.val[1]);
- ma3[0].val[0] = ma3[0].val[1];
- ma3[1].val[0] = ma3[1].val[1];
- b3[0].val[0] = b3[0].val[1];
- b3[1].val[0] = b3[1].val[1];
- ma5.val[0] = ma5.val[1];
- b5.val[0] = b5.val[1];
- ma565 += 8;
- b565 += 8;
- x += 8;
+ uint16x8_t ma[2];
+ uint8x16_t ma3x[3], ma5x[3];
+ uint32x4x2_t b[2];
+ BoxFilterPreProcess(src0, src1, x + 8, scales, s, sum3, sum5, square_sum3,
+ square_sum5, sq, ma3, b3, ma5, b5 + 1);
+ Prepare3_8<0>(ma3[0], ma3x);
+ ma[0] = Sum343<0>(ma3x);
+ ma[1] = Sum343<8>(ma3x);
+ b[0] = Sum343W(b3[0] + 0);
+ b[1] = Sum343W(b3[0] + 1);
+ vst1q_u16(ma343[0] + x, ma[0]);
+ vst1q_u16(ma343[0] + x + 8, ma[1]);
+ vst1q_u32(b343[0] + x, b[0].val[0]);
+ vst1q_u32(b343[0] + x + 4, b[0].val[1]);
+ vst1q_u32(b343[0] + x + 8, b[1].val[0]);
+ vst1q_u32(b343[0] + x + 12, b[1].val[1]);
+ Prepare3_8<0>(ma3[1], ma3x);
+ Store343_444<0>(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444<8>(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0] = Sum565<0>(ma5x);
+ ma[1] = Sum565<8>(ma5x);
+ b[0] = Sum565W(b5);
+ b[1] = Sum565W(b5 + 1);
+ vst1q_u16(ma565, ma[0]);
+ vst1q_u16(ma565 + 8, ma[1]);
+ vst1q_u32(b565 + 0, b[0].val[0]);
+ vst1q_u32(b565 + 4, b[0].val[1]);
+ vst1q_u32(b565 + 8, b[1].val[0]);
+ vst1q_u32(b565 + 12, b[1].val[1]);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ ma5[0] = ma5[1];
+ b5[0] = b5[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
} while (x < width);
}
@@ -1310,37 +1761,36 @@ inline int16x8_t CalculateFilteredOutputPass2(const uint8x8_t s,
return CalculateFilteredOutput<5>(s, ma_sum, b_sum);
}
-inline void SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2],
- uint8_t* const dst) {
+inline uint8x8_t SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2]) {
const int16x4_t v_lo =
vrshrn_n_s32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
const int16x4_t v_hi =
vrshrn_n_s32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
const int16x8_t vv = vcombine_s16(v_lo, v_hi);
- const int16x8_t s = ZeroExtend(src);
- const int16x8_t d = vaddq_s16(s, vv);
- vst1_u8(dst, vqmovun_s16(d));
+ const int16x8_t d =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vv), src));
+ return vqmovun_s16(d);
}
-inline void SelfGuidedDoubleMultiplier(const uint8x8_t src,
- const int16x8_t filter[2], const int w0,
- const int w2, uint8_t* const dst) {
+inline uint8x8_t SelfGuidedDoubleMultiplier(const uint8x8_t src,
+ const int16x8_t filter[2],
+ const int w0, const int w2) {
int32x4_t v[2];
v[0] = vmull_n_s16(vget_low_s16(filter[0]), w0);
v[1] = vmull_n_s16(vget_high_s16(filter[0]), w0);
v[0] = vmlal_n_s16(v[0], vget_low_s16(filter[1]), w2);
v[1] = vmlal_n_s16(v[1], vget_high_s16(filter[1]), w2);
- SelfGuidedFinal(src, v, dst);
+ return SelfGuidedFinal(src, v);
}
-inline void SelfGuidedSingleMultiplier(const uint8x8_t src,
- const int16x8_t filter, const int w0,
- uint8_t* const dst) {
+inline uint8x8_t SelfGuidedSingleMultiplier(const uint8x8_t src,
+ const int16x8_t filter,
+ const int w0) {
// weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
int32x4_t v[2];
v[0] = vmull_n_s16(vget_low_s16(filter), w0);
v[1] = vmull_n_s16(vget_high_s16(filter), w0);
- SelfGuidedFinal(src, v, dst);
+ return SelfGuidedFinal(src, v);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
@@ -1349,43 +1799,60 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
uint32_t* const square_sum5[5], const int width, const uint32_t scale,
const int16_t w0, uint16_t* const ma565[2], uint32_t* const b565[2],
uint8_t* const dst) {
- uint8x8x2_t s[2], mas;
- uint16x8x2_t sq[2], bs;
- s[0].val[0] = vld1_u8(src0);
- s[1].val[0] = vld1_u8(src1);
- sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
- sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
- BoxFilterPreProcess5(src0, src1, 0, scale, sum5, square_sum5, s, sq,
- &mas.val[0], &bs.val[0]);
+ uint8x16_t s[2][2], mas[2];
+ uint16x8_t sq[2][4], bs[3];
+ BoxFilterPreProcess5Lo(src0, src1, scale, s, sum5, square_sum5, sq, &mas[0],
+ &bs[0]);
int x = 0;
do {
- s[0].val[0] = s[0].val[1];
- s[1].val[0] = s[1].val[1];
- sq[0].val[0] = sq[0].val[1];
- sq[1].val[0] = sq[1].val[1];
- BoxFilterPreProcess5(src0, src1, x + 8, scale, sum5, square_sum5, s, sq,
- &mas.val[1], &bs.val[1]);
uint16x8_t ma[2];
+ uint8x16_t masx[3];
uint32x4x2_t b[2];
- ma[1] = Sum565(mas);
+ int16x8_t p0, p1;
+ BoxFilterPreProcess5(src0, src1, x + 8, scale, s, sum5, square_sum5, sq,
+ mas, bs + 1);
+ Prepare3_8<0>(mas, masx);
+ ma[1] = Sum565<0>(masx);
b[1] = Sum565W(bs);
vst1q_u16(ma565[1] + x, ma[1]);
vst1q_u32(b565[1] + x + 0, b[1].val[0]);
vst1q_u32(b565[1] + x + 4, b[1].val[1]);
- const uint8x8_t sr0 = vld1_u8(src + x);
- const uint8x8_t sr1 = vld1_u8(src + stride + x);
- int16x8_t p0, p1;
+ const uint8x16_t sr0 = vld1q_u8(src + x);
+ const uint8x16_t sr1 = vld1q_u8(src + stride + x);
+ const uint8x8_t sr00 = vget_low_u8(sr0);
+ const uint8x8_t sr10 = vget_low_u8(sr1);
ma[0] = vld1q_u16(ma565[0] + x);
b[0].val[0] = vld1q_u32(b565[0] + x + 0);
b[0].val[1] = vld1q_u32(b565[0] + x + 4);
- p0 = CalculateFilteredOutputPass1(sr0, ma, b);
- p1 = CalculateFilteredOutput<4>(sr1, ma[1], b[1]);
- SelfGuidedSingleMultiplier(sr0, p0, w0, dst + x);
- SelfGuidedSingleMultiplier(sr1, p1, w0, dst + stride + x);
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
- x += 8;
+ p0 = CalculateFilteredOutputPass1(sr00, ma, b);
+ p1 = CalculateFilteredOutput<4>(sr10, ma[1], b[1]);
+ const uint8x8_t d00 = SelfGuidedSingleMultiplier(sr00, p0, w0);
+ const uint8x8_t d10 = SelfGuidedSingleMultiplier(sr10, p1, w0);
+
+ ma[1] = Sum565<8>(masx);
+ b[1] = Sum565W(bs + 1);
+ vst1q_u16(ma565[1] + x + 8, ma[1]);
+ vst1q_u32(b565[1] + x + 8, b[1].val[0]);
+ vst1q_u32(b565[1] + x + 12, b[1].val[1]);
+ const uint8x8_t sr01 = vget_high_u8(sr0);
+ const uint8x8_t sr11 = vget_high_u8(sr1);
+ ma[0] = vld1q_u16(ma565[0] + x + 8);
+ b[0].val[0] = vld1q_u32(b565[0] + x + 8);
+ b[0].val[1] = vld1q_u32(b565[0] + x + 12);
+ p0 = CalculateFilteredOutputPass1(sr01, ma, b);
+ p1 = CalculateFilteredOutput<4>(sr11, ma[1], b[1]);
+ const uint8x8_t d01 = SelfGuidedSingleMultiplier(sr01, p0, w0);
+ const uint8x8_t d11 = SelfGuidedSingleMultiplier(sr11, p1, w0);
+ vst1q_u8(dst + x, vcombine_u8(d00, d01));
+ vst1q_u8(dst + stride + x, vcombine_u8(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
} while (x < width);
}
@@ -1396,34 +1863,45 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src,
uint32_t* const square_sum5[5],
uint16_t* ma565, uint32_t* b565,
uint8_t* const dst) {
- uint8x8x2_t s, mas;
- uint16x8x2_t sq, bs;
- s.val[0] = vld1_u8(src0);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
- BoxFilterPreProcess5LastRow(src0, 0, scale, sum5, square_sum5, &s, &sq,
- &mas.val[0], &bs.val[0]);
+ uint8x16_t s[2], mas[2];
+ uint16x8_t sq[4], bs[4];
+ BoxFilterPreProcess5LastRowLo(src0, scale, s, sum5, square_sum5, sq, &mas[0],
+ &bs[0]);
int x = 0;
do {
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
- BoxFilterPreProcess5LastRow(src0, x + 8, scale, sum5, square_sum5, &s, &sq,
- &mas.val[1], &bs.val[1]);
uint16x8_t ma[2];
+ uint8x16_t masx[3];
uint32x4x2_t b[2];
- ma[1] = Sum565(mas);
+ BoxFilterPreProcess5LastRow(src0, x + 8, scale, s, sum5, square_sum5,
+ sq + 1, mas, bs + 1);
+ Prepare3_8<0>(mas, masx);
+ ma[1] = Sum565<0>(masx);
b[1] = Sum565W(bs);
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
ma[0] = vld1q_u16(ma565);
b[0].val[0] = vld1q_u32(b565 + 0);
b[0].val[1] = vld1q_u32(b565 + 4);
- const uint8x8_t sr = vld1_u8(src + x);
- const int16x8_t p = CalculateFilteredOutputPass1(sr, ma, b);
- SelfGuidedSingleMultiplier(sr, p, w0, dst + x);
- ma565 += 8;
- b565 += 8;
- x += 8;
+ const uint8x16_t sr = vld1q_u8(src + x);
+ const uint8x8_t sr0 = vget_low_u8(sr);
+ const int16x8_t p0 = CalculateFilteredOutputPass1(sr0, ma, b);
+ const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0);
+
+ ma[1] = Sum565<8>(masx);
+ b[1] = Sum565W(bs + 1);
+ bs[0] = bs[2];
+ const uint8x8_t sr1 = vget_high_u8(sr);
+ ma[0] = vld1q_u16(ma565 + 8);
+ b[0].val[0] = vld1q_u32(b565 + 8);
+ b[0].val[1] = vld1q_u32(b565 + 12);
+ const int16x8_t p1 = CalculateFilteredOutputPass1(sr1, ma, b);
+ const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0);
+ vst1q_u8(dst + x, vcombine_u8(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
} while (x < width);
}
@@ -1433,35 +1911,49 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
uint32_t* const square_sum3[3], uint16_t* const ma343[3],
uint16_t* const ma444[2], uint32_t* const b343[3], uint32_t* const b444[2],
uint8_t* const dst) {
- uint8x8x2_t s, mas;
- uint16x8x2_t sq, bs;
- s.val[0] = vld1_u8(src0);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
- BoxFilterPreProcess3(src0, 0, scale, sum3, square_sum3, &s, &sq, &mas.val[0],
- &bs.val[0]);
+ uint8x16_t s[2], mas[2];
+ uint16x8_t sq[4], bs[3];
+ BoxFilterPreProcess3Lo(src0, scale, &s[0], sum3, square_sum3, sq, &mas[0],
+ &bs[0]);
int x = 0;
do {
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
- BoxFilterPreProcess3(src0, x + 8, scale, sum3, square_sum3, &s, &sq,
- &mas.val[1], &bs.val[1]);
uint16x8_t ma[3];
+ uint8x16_t ma3x[3];
uint32x4x2_t b[3];
- Store343_444(mas, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2],
- b444[1]);
- const uint8x8_t sr = vld1_u8(src + x);
+ BoxFilterPreProcess3(src0, x + 8, scale, sum3, square_sum3, s, sq + 1, mas,
+ bs + 1);
+ Prepare3_8<0>(mas, ma3x);
+ Store343_444<0>(ma3x, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2],
+ b444[1]);
+ const uint8x16_t sr = vld1q_u8(src + x);
+ const uint8x8_t sr0 = vget_low_u8(sr);
ma[0] = vld1q_u16(ma343[0] + x);
ma[1] = vld1q_u16(ma444[0] + x);
b[0].val[0] = vld1q_u32(b343[0] + x + 0);
b[0].val[1] = vld1q_u32(b343[0] + x + 4);
b[1].val[0] = vld1q_u32(b444[0] + x + 0);
b[1].val[1] = vld1q_u32(b444[0] + x + 4);
- const int16x8_t p = CalculateFilteredOutputPass2(sr, ma, b);
- SelfGuidedSingleMultiplier(sr, p, w0, dst + x);
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
- x += 8;
+ const int16x8_t p0 = CalculateFilteredOutputPass2(sr0, ma, b);
+ const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0);
+
+ Store343_444<8>(ma3x, bs + 1, x + 8, &ma[2], &b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const uint8x8_t sr1 = vget_high_u8(sr);
+ ma[0] = vld1q_u16(ma343[0] + x + 8);
+ ma[1] = vld1q_u16(ma444[0] + x + 8);
+ b[0].val[0] = vld1q_u32(b343[0] + x + 8);
+ b[0].val[1] = vld1q_u32(b343[0] + x + 12);
+ b[1].val[0] = vld1q_u32(b444[0] + x + 8);
+ b[1].val[1] = vld1q_u32(b444[0] + x + 12);
+ const int16x8_t p1 = CalculateFilteredOutputPass2(sr1, ma, b);
+ const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0);
+ vst1q_u8(dst + x, vcombine_u8(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
} while (x < width);
}
@@ -1474,64 +1966,96 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter(
uint16_t* const ma343[4], uint16_t* const ma444[3],
uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
uint32_t* const b565[2], uint8_t* const dst) {
- uint8x8x2_t s[2], ma3[2], ma5;
- uint16x8x2_t sq[2], b3[2], b5;
- s[0].val[0] = vld1_u8(src0);
- s[1].val[0] = vld1_u8(src1);
- sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
- sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
- BoxFilterPreProcess(src0, src1, 0, scales, sum3, sum5, square_sum3,
- square_sum5, s, sq, &ma3[0].val[0], &ma3[1].val[0],
- &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]);
+ uint8x16_t s[2][2], ma3[2][2], ma5[2];
+ uint16x8_t sq[2][4], b3[2][3], b5[3];
+ BoxFilterPreProcessLo(src0, src1, scales, s, sum3, sum5, square_sum3,
+ square_sum5, sq, ma3, b3, &ma5[0], &b5[0]);
int x = 0;
do {
- s[0].val[0] = s[0].val[1];
- s[1].val[0] = s[1].val[1];
- sq[0].val[0] = sq[0].val[1];
- sq[1].val[0] = sq[1].val[1];
- BoxFilterPreProcess(src0, src1, x + 8, scales, sum3, sum5, square_sum3,
- square_sum5, s, sq, &ma3[0].val[1], &ma3[1].val[1],
- &b3[0].val[1], &b3[1].val[1], &ma5.val[1], &b5.val[1]);
uint16x8_t ma[3][3];
+ uint8x16_t ma3x[2][3], ma5x[3];
uint32x4x2_t b[3][3];
- Store343_444(ma3[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1],
- ma343[2], ma444[1], b343[2], b444[1]);
- Store343_444(ma3[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2],
- b343[3], b444[2]);
- ma[0][1] = Sum565(ma5);
+ int16x8_t p[2][2];
+ BoxFilterPreProcess(src0, src1, x + 8, scales, s, sum3, sum5, square_sum3,
+ square_sum5, sq, ma3, b3, ma5, b5 + 1);
+ Prepare3_8<0>(ma3[0], ma3x[0]);
+ Prepare3_8<0>(ma3[1], ma3x[1]);
+ Store343_444<0>(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444<0>(ma3x[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0][1] = Sum565<0>(ma5x);
b[0][1] = Sum565W(b5);
vst1q_u16(ma565[1] + x, ma[0][1]);
vst1q_u32(b565[1] + x, b[0][1].val[0]);
vst1q_u32(b565[1] + x + 4, b[0][1].val[1]);
- ma3[0].val[0] = ma3[0].val[1];
- ma3[1].val[0] = ma3[1].val[1];
- b3[0].val[0] = b3[0].val[1];
- b3[1].val[0] = b3[1].val[1];
- ma5.val[0] = ma5.val[1];
- b5.val[0] = b5.val[1];
- int16x8_t p[2][2];
- const uint8x8_t sr0 = vld1_u8(src + x);
- const uint8x8_t sr1 = vld1_u8(src + stride + x);
+ const uint8x16_t sr0 = vld1q_u8(src + x);
+ const uint8x16_t sr1 = vld1q_u8(src + stride + x);
+ const uint8x8_t sr00 = vget_low_u8(sr0);
+ const uint8x8_t sr10 = vget_low_u8(sr1);
ma[0][0] = vld1q_u16(ma565[0] + x);
b[0][0].val[0] = vld1q_u32(b565[0] + x);
b[0][0].val[1] = vld1q_u32(b565[0] + x + 4);
- p[0][0] = CalculateFilteredOutputPass1(sr0, ma[0], b[0]);
- p[1][0] = CalculateFilteredOutput<4>(sr1, ma[0][1], b[0][1]);
+ p[0][0] = CalculateFilteredOutputPass1(sr00, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr10, ma[0][1], b[0][1]);
ma[1][0] = vld1q_u16(ma343[0] + x);
ma[1][1] = vld1q_u16(ma444[0] + x);
b[1][0].val[0] = vld1q_u32(b343[0] + x);
b[1][0].val[1] = vld1q_u32(b343[0] + x + 4);
b[1][1].val[0] = vld1q_u32(b444[0] + x);
b[1][1].val[1] = vld1q_u32(b444[0] + x + 4);
- p[0][1] = CalculateFilteredOutputPass2(sr0, ma[1], b[1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr00, ma[1], b[1]);
ma[2][0] = vld1q_u16(ma343[1] + x);
b[2][0].val[0] = vld1q_u32(b343[1] + x);
b[2][0].val[1] = vld1q_u32(b343[1] + x + 4);
- p[1][1] = CalculateFilteredOutputPass2(sr1, ma[2], b[2]);
- SelfGuidedDoubleMultiplier(sr0, p[0], w0, w2, dst + x);
- SelfGuidedDoubleMultiplier(sr1, p[1], w0, w2, dst + stride + x);
- x += 8;
+ p[1][1] = CalculateFilteredOutputPass2(sr10, ma[2], b[2]);
+ const uint8x8_t d00 = SelfGuidedDoubleMultiplier(sr00, p[0], w0, w2);
+ const uint8x8_t d10 = SelfGuidedDoubleMultiplier(sr10, p[1], w0, w2);
+
+ Store343_444<8>(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], &b[1][2],
+ &b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444<8>(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], &b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ ma[0][1] = Sum565<8>(ma5x);
+ b[0][1] = Sum565W(b5 + 1);
+ vst1q_u16(ma565[1] + x + 8, ma[0][1]);
+ vst1q_u32(b565[1] + x + 8, b[0][1].val[0]);
+ vst1q_u32(b565[1] + x + 12, b[0][1].val[1]);
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ const uint8x8_t sr01 = vget_high_u8(sr0);
+ const uint8x8_t sr11 = vget_high_u8(sr1);
+ ma[0][0] = vld1q_u16(ma565[0] + x + 8);
+ b[0][0].val[0] = vld1q_u32(b565[0] + x + 8);
+ b[0][0].val[1] = vld1q_u32(b565[0] + x + 12);
+ p[0][0] = CalculateFilteredOutputPass1(sr01, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr11, ma[0][1], b[0][1]);
+ ma[1][0] = vld1q_u16(ma343[0] + x + 8);
+ ma[1][1] = vld1q_u16(ma444[0] + x + 8);
+ b[1][0].val[0] = vld1q_u32(b343[0] + x + 8);
+ b[1][0].val[1] = vld1q_u32(b343[0] + x + 12);
+ b[1][1].val[0] = vld1q_u32(b444[0] + x + 8);
+ b[1][1].val[1] = vld1q_u32(b444[0] + x + 12);
+ p[0][1] = CalculateFilteredOutputPass2(sr01, ma[1], b[1]);
+ ma[2][0] = vld1q_u16(ma343[1] + x + 8);
+ b[2][0].val[0] = vld1q_u32(b343[1] + x + 8);
+ b[2][0].val[1] = vld1q_u32(b343[1] + x + 12);
+ p[1][1] = CalculateFilteredOutputPass2(sr11, ma[2], b[2]);
+ const uint8x8_t d01 = SelfGuidedDoubleMultiplier(sr01, p[0], w0, w2);
+ const uint8x8_t d11 = SelfGuidedDoubleMultiplier(sr11, p[1], w0, w2);
+ vst1q_u8(dst + x, vcombine_u8(d00, d01));
+ vst1q_u8(dst + stride + x, vcombine_u8(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ x += 16;
} while (x < width);
}
@@ -1540,58 +2064,79 @@ inline void BoxFilterLastRow(
const uint16_t scales[2], const int16_t w0, const int16_t w2,
uint16_t* const sum3[4], uint16_t* const sum5[5],
uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
- uint16_t* const ma343[4], uint16_t* const ma444[3],
- uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
- uint32_t* const b565[2], uint8_t* const dst) {
- uint8x8x2_t s, ma3, ma5;
- uint16x8x2_t sq, b3, b5;
- uint16x8_t ma[3];
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint8_t* const dst) {
+ uint8x16_t s[2], ma3[2], ma5[2];
+ uint16x8_t sq[4], ma[3], b3[3], b5[3];
uint32x4x2_t b[3];
- s.val[0] = vld1_u8(src0);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
- BoxFilterPreProcessLastRow(src0, 0, scales, sum3, sum5, square_sum3,
- square_sum5, &s, &sq, &ma3.val[0], &ma5.val[0],
- &b3.val[0], &b5.val[0]);
+ BoxFilterPreProcessLastRowLo(src0, scales, sum3, sum5, square_sum3,
+ square_sum5, &s[0], sq, &ma3[0], &ma5[0], &b3[0],
+ &b5[0]);
int x = 0;
do {
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
+ uint8x16_t ma3x[3], ma5x[3];
+ int16x8_t p[2];
BoxFilterPreProcessLastRow(src0, x + 8, scales, sum3, sum5, square_sum3,
- square_sum5, &s, &sq, &ma3.val[1], &ma5.val[1],
- &b3.val[1], &b5.val[1]);
- ma[1] = Sum565(ma5);
+ square_sum5, s, sq + 1, ma3, ma5, &b3[1],
+ &b5[1]);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[1] = Sum565<0>(ma5x);
b[1] = Sum565W(b5);
- ma5.val[0] = ma5.val[1];
- b5.val[0] = b5.val[1];
- ma[2] = Sum343(ma3);
+ Prepare3_8<0>(ma3, ma3x);
+ ma[2] = Sum343<0>(ma3x);
b[2] = Sum343W(b3);
- ma3.val[0] = ma3.val[1];
- b3.val[0] = b3.val[1];
- const uint8x8_t sr = vld1_u8(src + x);
- int16x8_t p[2];
- ma[0] = vld1q_u16(ma565[0] + x);
- b[0].val[0] = vld1q_u32(b565[0] + x + 0);
- b[0].val[1] = vld1q_u32(b565[0] + x + 4);
- p[0] = CalculateFilteredOutputPass1(sr, ma, b);
- ma[0] = vld1q_u16(ma343[0] + x);
- ma[1] = vld1q_u16(ma444[0] + x);
- b[0].val[0] = vld1q_u32(b343[0] + x + 0);
- b[0].val[1] = vld1q_u32(b343[0] + x + 4);
- b[1].val[0] = vld1q_u32(b444[0] + x + 0);
- b[1].val[1] = vld1q_u32(b444[0] + x + 4);
- p[1] = CalculateFilteredOutputPass2(sr, ma, b);
- SelfGuidedDoubleMultiplier(sr, p, w0, w2, dst + x);
- x += 8;
+ const uint8x16_t sr = vld1q_u8(src + x);
+ const uint8x8_t sr0 = vget_low_u8(sr);
+ ma[0] = vld1q_u16(ma565 + x);
+ b[0].val[0] = vld1q_u32(b565 + x + 0);
+ b[0].val[1] = vld1q_u32(b565 + x + 4);
+ p[0] = CalculateFilteredOutputPass1(sr0, ma, b);
+ ma[0] = vld1q_u16(ma343 + x);
+ ma[1] = vld1q_u16(ma444 + x);
+ b[0].val[0] = vld1q_u32(b343 + x + 0);
+ b[0].val[1] = vld1q_u32(b343 + x + 4);
+ b[1].val[0] = vld1q_u32(b444 + x + 0);
+ b[1].val[1] = vld1q_u32(b444 + x + 4);
+ p[1] = CalculateFilteredOutputPass2(sr0, ma, b);
+ const uint8x8_t d0 = SelfGuidedDoubleMultiplier(sr0, p, w0, w2);
+
+ ma[1] = Sum565<8>(ma5x);
+ b[1] = Sum565W(b5 + 1);
+ b5[0] = b5[2];
+ ma[2] = Sum343<8>(ma3x);
+ b[2] = Sum343W(b3 + 1);
+ b3[0] = b3[2];
+ const uint8x8_t sr1 = vget_high_u8(sr);
+ ma[0] = vld1q_u16(ma565 + x + 8);
+ b[0].val[0] = vld1q_u32(b565 + x + 8);
+ b[0].val[1] = vld1q_u32(b565 + x + 12);
+ p[0] = CalculateFilteredOutputPass1(sr1, ma, b);
+ ma[0] = vld1q_u16(ma343 + x + 8);
+ ma[1] = vld1q_u16(ma444 + x + 8);
+ b[0].val[0] = vld1q_u32(b343 + x + 8);
+ b[0].val[1] = vld1q_u32(b343 + x + 12);
+ b[1].val[0] = vld1q_u32(b444 + x + 8);
+ b[1].val[1] = vld1q_u32(b444 + x + 12);
+ p[1] = CalculateFilteredOutputPass2(sr1, ma, b);
+ const uint8x8_t d1 = SelfGuidedDoubleMultiplier(sr1, p, w0, w2);
+ vst1q_u8(dst + x, vcombine_u8(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ ma3[0] = ma3[1];
+ ma5[0] = ma5[1];
+ x += 16;
} while (x < width);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
const RestorationUnitInfo& restoration_info, const uint8_t* src,
- const uint8_t* const top_border, const uint8_t* bottom_border,
- const ptrdiff_t stride, const int width, const int height,
+ const ptrdiff_t stride, const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
SgrBuffer* const sgr_buffer, uint8_t* dst) {
- const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
const ptrdiff_t sum_stride = temp_stride + 8;
const int sgr_proj_index = restoration_info.sgr_proj_info.index;
const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
@@ -1628,13 +2173,13 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
b565[1] = b565[0] + temp_stride;
assert(scales[0] != 0);
assert(scales[1] != 0);
- BoxSum(top_border, stride, 2, sum_stride, sum3[0], sum5[1], square_sum3[0],
- square_sum5[1]);
+ BoxSum(top_border, top_border_stride, sum_stride, sum3[0], sum5[1],
+ square_sum3[0], square_sum5[1]);
sum5[0] = sum5[1];
square_sum5[0] = square_sum5[1];
const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
- square_sum5, ma343, ma444, ma565[0], b343, b444,
+ square_sum5, ma343, ma444[0], ma565[0], b343, b444[0],
b565[0]);
sum5[0] = sgr_buffer->sum5;
square_sum5[0] = sgr_buffer->square_sum5;
@@ -1665,7 +2210,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
const uint8_t* sr[2];
if ((height & 1) == 0) {
sr[0] = bottom_border;
- sr[1] = bottom_border + stride;
+ sr[1] = bottom_border + bottom_border_stride;
} else {
sr[0] = src + 2 * stride;
sr[1] = bottom_border;
@@ -1689,20 +2234,22 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
std::swap(ma565[0], ma565[1]);
std::swap(b565[0], b565[1]);
}
- BoxFilterLastRow(src + 3, bottom_border + stride, width, scales, w0, w2,
- sum3, sum5, square_sum3, square_sum5, ma343, ma444, ma565,
- b343, b444, b565, dst);
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5,
+ ma343[0], ma444[0], ma565[0], b343[0], b444[0], b565[0],
+ dst);
}
}
inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
- const uint8_t* src,
+ const uint8_t* src, const ptrdiff_t stride,
const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
const uint8_t* bottom_border,
- const ptrdiff_t stride, const int width,
- const int height, SgrBuffer* const sgr_buffer,
- uint8_t* dst) {
- const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
const ptrdiff_t sum_stride = temp_stride + 8;
const int sgr_proj_index = restoration_info.sgr_proj_info.index;
const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
@@ -1720,7 +2267,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
b565[0] = sgr_buffer->b565;
b565[1] = b565[0] + temp_stride;
assert(scale != 0);
- BoxSum<5>(top_border, stride, 2, sum_stride, sum5[1], square_sum5[1]);
+ BoxSum<5>(top_border, top_border_stride, sum_stride, sum5[1], square_sum5[1]);
sum5[0] = sum5[1];
square_sum5[0] = square_sum5[1];
const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
@@ -1746,7 +2293,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
const uint8_t* sr[2];
if ((height & 1) == 0) {
sr[0] = bottom_border;
- sr[1] = bottom_border + stride;
+ sr[1] = bottom_border + bottom_border_stride;
} else {
sr[0] = src + 2 * stride;
sr[1] = bottom_border;
@@ -1763,20 +2310,21 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
Circulate5PointersBy2<uint16_t>(sum5);
Circulate5PointersBy2<uint32_t>(square_sum5);
}
- BoxFilterPass1LastRow(src + 3, bottom_border + stride, width, scale, w0,
- sum5, square_sum5, ma565[0], b565[0], dst);
+ BoxFilterPass1LastRow(src + 3, bottom_border + bottom_border_stride, width,
+ scale, w0, sum5, square_sum5, ma565[0], b565[0], dst);
}
}
inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
- const uint8_t* src,
+ const uint8_t* src, const ptrdiff_t stride,
const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
const uint8_t* bottom_border,
- const ptrdiff_t stride, const int width,
- const int height, SgrBuffer* const sgr_buffer,
- uint8_t* dst) {
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
- const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
const ptrdiff_t sum_stride = temp_stride + 8;
const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
@@ -1799,7 +2347,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
b444[0] = sgr_buffer->b444;
b444[1] = b444[0] + temp_stride;
assert(scale != 0);
- BoxSum<3>(top_border, stride, 2, sum_stride, sum3[0], square_sum3[0]);
+ BoxSum<3>(top_border, top_border_stride, sum_stride, sum3[0], square_sum3[0]);
BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, ma343[0],
nullptr, b343[0], nullptr);
Circulate3PointersBy1<uint16_t>(sum3);
@@ -1809,7 +2357,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
s = src + stride;
} else {
s = bottom_border;
- bottom_border += stride;
+ bottom_border += bottom_border_stride;
}
BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, ma343[1],
ma444[0], b343[1], b444[0]);
@@ -1836,7 +2384,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
ma343, ma444, b343, b444, dst);
src += stride;
dst += stride;
- bottom_border += stride;
+ bottom_border += bottom_border_stride;
Circulate3PointersBy1<uint16_t>(ma343);
Circulate3PointersBy1<uint32_t>(b343);
std::swap(ma444[0], ma444[1]);
@@ -1849,8 +2397,9 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
// part of the visible frame.
void SelfGuidedFilter_NEON(
const RestorationUnitInfo& restoration_info, const void* const source,
- const void* const top_border, const void* const bottom_border,
- const ptrdiff_t stride, const int width, const int height,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
RestorationBuffer* const restoration_buffer, void* const dest) {
const int index = restoration_info.sgr_proj_info.index;
const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
@@ -1864,14 +2413,17 @@ void SelfGuidedFilter_NEON(
// |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
// following assertion.
assert(radius_pass_0 != 0);
- BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3,
- stride, width, height, sgr_buffer, dst);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
} else if (radius_pass_0 == 0) {
- BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2,
- stride, width, height, sgr_buffer, dst);
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
} else {
- BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride,
- width, height, sgr_buffer, dst);
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
}
}
@@ -1890,7 +2442,7 @@ void LoopRestorationInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
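The refactored SelfGuidedFinal and SelfGuidedSingleMultiplier above now return a uint8x8_t so that two 8-pixel halves can be combined with vcombine_u8 and written with a single vst1q_u8 store. As a rough guide to what each lane computes, here is a minimal scalar sketch, not part of the patch, assuming kSgrProjRestoreBits == 4 and kSgrProjPrecisionBits == 7 as defined elsewhere in libgav1; the name SelfGuidedSingleScalar is purely illustrative.

#include <algorithm>
#include <cstdint>

// Per-pixel equivalent of SelfGuidedSingleMultiplier + SelfGuidedFinal:
// scale the filter correction by w0, round-shift it back to pixel range,
// add it to the source pixel and clamp to [0, 255].
inline uint8_t SelfGuidedSingleScalar(uint8_t src, int16_t filter, int w0) {
  constexpr int kShift = 4 + 7;  // kSgrProjRestoreBits + kSgrProjPrecisionBits
  const int32_t v = filter * w0;                                // vmull_n_s16
  const int32_t rounded = (v + (1 << (kShift - 1))) >> kShift;  // vrshrn_n_s32
  const int32_t d = src + rounded;                              // vaddw_u8
  return static_cast<uint8_t>(std::min(std::max(d, 0), 255));   // vqmovun_s16
}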
diff --git a/src/dsp/arm/mask_blend_neon.cc b/src/dsp/arm/mask_blend_neon.cc
index 084f42f..ee50923 100644
--- a/src/dsp/arm/mask_blend_neon.cc
+++ b/src/dsp/arm/mask_blend_neon.cc
@@ -432,7 +432,7 @@ void MaskBlendInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/motion_field_projection_neon.cc b/src/dsp/arm/motion_field_projection_neon.cc
index 8caba7d..3e731b2 100644
--- a/src/dsp/arm/motion_field_projection_neon.cc
+++ b/src/dsp/arm/motion_field_projection_neon.cc
@@ -382,7 +382,7 @@ void MotionFieldProjectionInit_NEON() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/motion_vector_search_neon.cc b/src/dsp/arm/motion_vector_search_neon.cc
index 8a403a6..da3ba17 100644
--- a/src/dsp/arm/motion_vector_search_neon.cc
+++ b/src/dsp/arm/motion_vector_search_neon.cc
@@ -256,7 +256,7 @@ void MotionVectorSearchInit_NEON() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/obmc_neon.cc b/src/dsp/arm/obmc_neon.cc
index 66ad663..1111a90 100644
--- a/src/dsp/arm/obmc_neon.cc
+++ b/src/dsp/arm/obmc_neon.cc
@@ -380,7 +380,7 @@ void ObmcInit_NEON() { Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/super_res_neon.cc b/src/dsp/arm/super_res_neon.cc
index 1680450..91537c4 100644
--- a/src/dsp/arm/super_res_neon.cc
+++ b/src/dsp/arm/super_res_neon.cc
@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/arm/common_neon.h"
#include "src/dsp/super_res.h"
#include "src/utils/cpu.h"
@@ -20,6 +19,7 @@
#include <arm_neon.h>
+#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
@@ -82,10 +82,10 @@ inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps],
}
void SuperRes_NEON(const void* const coefficients, void* const source,
- const ptrdiff_t stride, const int height,
+ const ptrdiff_t source_stride, const int height,
const int downscaled_width, const int upscaled_width,
const int initial_subpixel_x, const int step,
- void* const dest) {
+ void* const dest, const ptrdiff_t dest_stride) {
auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
auto* dst = static_cast<uint8_t*>(dest);
int y = height;
@@ -100,7 +100,7 @@ void SuperRes_NEON(const void* const coefficients, void* const source,
int x = RightShiftWithCeiling(upscaled_width, 4);
// The below code calculates up to 15 extra upscaled
// pixels which will over-read up to 15 downscaled pixels in the end of each
- // row. kSuperResHorizontalBorder accounts for this.
+ // row. kSuperResHorizontalPadding accounts for this.
do {
for (int i = 0; i < 8; ++i, subpixel_x += step) {
sr[i] = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
@@ -135,8 +135,8 @@ void SuperRes_NEON(const void* const coefficients, void* const source,
vst1q_u8(dst_ptr, vcombine_u8(d0, d1));
dst_ptr += 16;
} while (--x != 0);
- src += stride;
- dst += stride;
+ src += source_stride;
+ dst += dest_stride;
} while (--y != 0);
}
@@ -149,12 +149,147 @@ void Init8bpp() {
} // namespace
} // namespace low_bitdepth
-void SuperResInit_NEON() { low_bitdepth::Init8bpp(); }
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void SuperResCoefficients_NEON(const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const coefficients) {
+ auto* dst = static_cast<uint16_t*>(coefficients);
+ int subpixel_x = initial_subpixel_x;
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ do {
+ uint16x8_t filter[8];
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ const uint8x8_t filter_8 =
+ vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
+ kSuperResExtraBits]);
+ // uint8_t -> uint16_t
+ filter[i] = vmovl_u8(filter_8);
+ }
+
+ Transpose8x8(filter);
+
+ vst1q_u16(dst, filter[0]);
+ dst += 8;
+ vst1q_u16(dst, filter[1]);
+ dst += 8;
+ vst1q_u16(dst, filter[2]);
+ dst += 8;
+ vst1q_u16(dst, filter[3]);
+ dst += 8;
+ vst1q_u16(dst, filter[4]);
+ dst += 8;
+ vst1q_u16(dst, filter[5]);
+ dst += 8;
+ vst1q_u16(dst, filter[6]);
+ dst += 8;
+ vst1q_u16(dst, filter[7]);
+ dst += 8;
+ } while (--x != 0);
+}
+
+// The sum is clipped to [0, ((1 << bitdepth) -1)]. Adding all positive and then
+// subtracting all negative with saturation will clip to zero.
+// 0 1 2 3 4 5 6 7
+// tap sign: - + - + + - + -
+inline uint16x8_t SuperRes(const uint16x8_t src[kSuperResFilterTaps],
+ const uint16_t** coefficients, int bitdepth) {
+ uint16x8_t f[kSuperResFilterTaps];
+ for (int i = 0; i < kSuperResFilterTaps; ++i, *coefficients += 8) {
+ f[i] = vld1q_u16(*coefficients);
+ }
+
+ uint32x4_t res_lo = vmull_u16(vget_low_u16(src[1]), vget_low_u16(f[1]));
+ res_lo = vmlal_u16(res_lo, vget_low_u16(src[3]), vget_low_u16(f[3]));
+ res_lo = vmlal_u16(res_lo, vget_low_u16(src[4]), vget_low_u16(f[4]));
+ res_lo = vmlal_u16(res_lo, vget_low_u16(src[6]), vget_low_u16(f[6]));
+
+ uint32x4_t temp_lo = vmull_u16(vget_low_u16(src[0]), vget_low_u16(f[0]));
+ temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[2]), vget_low_u16(f[2]));
+ temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[5]), vget_low_u16(f[5]));
+ temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[7]), vget_low_u16(f[7]));
+
+ res_lo = vqsubq_u32(res_lo, temp_lo);
+
+ uint32x4_t res_hi = vmull_u16(vget_high_u16(src[1]), vget_high_u16(f[1]));
+ res_hi = vmlal_u16(res_hi, vget_high_u16(src[3]), vget_high_u16(f[3]));
+ res_hi = vmlal_u16(res_hi, vget_high_u16(src[4]), vget_high_u16(f[4]));
+ res_hi = vmlal_u16(res_hi, vget_high_u16(src[6]), vget_high_u16(f[6]));
+ uint32x4_t temp_hi = vmull_u16(vget_high_u16(src[0]), vget_high_u16(f[0]));
+ temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[2]), vget_high_u16(f[2]));
+ temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[5]), vget_high_u16(f[5]));
+ temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[7]), vget_high_u16(f[7]));
+
+ res_hi = vqsubq_u32(res_hi, temp_hi);
+
+ const uint16x8_t res = vcombine_u16(vqrshrn_n_u32(res_lo, kFilterBits),
+ vqrshrn_n_u32(res_hi, kFilterBits));
+
+ // Clip the result at (1 << bd) - 1.
+ return vminq_u16(res, vdupq_n_u16((1 << bitdepth) - 1));
+}
+
+template <int bitdepth>
+void SuperRes_NEON(const void* const coefficients, void* const source,
+ const ptrdiff_t source_stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const dest, const ptrdiff_t dest_stride) {
+ auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<uint16_t*>(dest);
+ int y = height;
+ do {
+ const auto* filter = static_cast<const uint16_t*>(coefficients);
+ uint16_t* dst_ptr = dst;
+ ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+ int subpixel_x = initial_subpixel_x;
+ uint16x8_t sr[8];
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ // The below code calculates up to 7 extra upscaled
+ // pixels which will over-read up to 7 downscaled pixels in the end of each
+ // row. kSuperResHorizontalBorder accounts for this.
+ do {
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ sr[i] = vld1q_u16(&src[subpixel_x >> kSuperResScaleBits]);
+ }
+
+ Transpose8x8(sr);
+
+ const uint16x8_t d0 = SuperRes(sr, &filter, bitdepth);
+ vst1q_u16(dst_ptr, d0);
+ dst_ptr += 8;
+ } while (--x != 0);
+ src += source_stride;
+ dst += dest_stride;
+ } while (--y != 0);
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->super_res_coefficients = SuperResCoefficients_NEON;
+ dsp->super_res = SuperRes_NEON<10>;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void SuperResInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
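The tap-sign comment in the new 10-bit SuperRes() above explains why the accumulation can stay unsigned: the positive-tap and negative-tap products are summed separately and the negative group is removed with a saturating subtract. A minimal scalar sketch of that trick, not part of the patch, assuming kFilterBits == 7 (the AV1 upscale filter precision) and using the illustrative name SuperResScalar:

#include <algorithm>
#include <cstdint>

// Taps 1, 3, 4, 6 are positive and taps 0, 2, 5, 7 are negative, so both
// groups are summed as unsigned magnitudes; the saturating subtract clips
// the result at zero before the rounding shift and the (1 << bitdepth) - 1
// clamp.
uint16_t SuperResScalar(const uint16_t src[8], const uint16_t f[8],
                        int bitdepth) {
  constexpr int kFilterBits = 7;
  const uint32_t pos =
      src[1] * f[1] + src[3] * f[3] + src[4] * f[4] + src[6] * f[6];
  const uint32_t neg =
      src[0] * f[0] + src[2] * f[2] + src[5] * f[5] + src[7] * f[7];
  const uint32_t diff = (pos > neg) ? pos - neg : 0;            // vqsubq_u32
  const uint32_t out =
      (diff + (1 << (kFilterBits - 1))) >> kFilterBits;         // vqrshrn_n_u32
  const uint32_t max_pixel = (1u << bitdepth) - 1;
  return static_cast<uint16_t>(std::min(out, max_pixel));       // vminq_u16
}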
diff --git a/src/dsp/arm/super_res_neon.h b/src/dsp/arm/super_res_neon.h
index f51785d..65e48c5 100644
--- a/src/dsp/arm/super_res_neon.h
+++ b/src/dsp/arm/super_res_neon.h
@@ -31,7 +31,10 @@ void SuperResInit_NEON();
#if LIBGAV1_ENABLE_NEON
#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_SuperResClip LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_NEON
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
diff --git a/src/dsp/arm/warp_neon.cc b/src/dsp/arm/warp_neon.cc
index 7a41998..c7fb739 100644
--- a/src/dsp/arm/warp_neon.cc
+++ b/src/dsp/arm/warp_neon.cc
@@ -289,7 +289,7 @@ void Warp_NEON(const void* const source, const ptrdiff_t source_stride,
const int16x8_t sum = vld1q_s16(tmp);
vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
}
-#else // !defined(__aarch64__)
+#else  // !defined(__aarch64__)
int16x8_t filter[8];
for (int x = 0; x < 8; ++x) {
const int offset =
@@ -442,7 +442,7 @@ void WarpInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/weight_mask_neon.cc b/src/dsp/arm/weight_mask_neon.cc
index 49d3be0..7e5bff0 100644
--- a/src/dsp/arm/weight_mask_neon.cc
+++ b/src/dsp/arm/weight_mask_neon.cc
@@ -451,7 +451,7 @@ void WeightMaskInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {