path: root/src/dsp/arm
author     qinxialei <xialeiqin@gmail.com>  2021-04-22 11:20:15 +0800
committer  qinxialei <xialeiqin@gmail.com>  2021-04-22 11:20:15 +0800
commit     2381d803c76105f44717d75f089ec37f51e5cfe4 (patch)
tree       33f40fb4dfd1039ac262d5f1c1065d298578ddc1 /src/dsp/arm
parent     e8d277081293b6fb2a5d469616baaa7a06f52496 (diff)
download   libgav1-2381d803c76105f44717d75f089ec37f51e5cfe4.tar.gz
           libgav1-2381d803c76105f44717d75f089ec37f51e5cfe4.tar.bz2
           libgav1-2381d803c76105f44717d75f089ec37f51e5cfe4.zip
New upstream version 0.16.3
Diffstat (limited to 'src/dsp/arm')
-rw-r--r--  src/dsp/arm/average_blend_neon.cc            135
-rw-r--r--  src/dsp/arm/cdef_neon.cc                      11
-rw-r--r--  src/dsp/arm/common_neon.h                     70
-rw-r--r--  src/dsp/arm/convolve_neon.cc                  943
-rw-r--r--  src/dsp/arm/distance_weighted_blend_neon.cc   162
-rw-r--r--  src/dsp/arm/distance_weighted_blend_neon.h    2
-rw-r--r--  src/dsp/arm/film_grain_neon.cc                2
-rw-r--r--  src/dsp/arm/intra_edge_neon.cc                243
-rw-r--r--  src/dsp/arm/intra_edge_neon.h                 3
-rw-r--r--  src/dsp/arm/intrapred_cfl_neon.cc             1012
-rw-r--r--  src/dsp/arm/intrapred_cfl_neon.h              179
-rw-r--r--  src/dsp/arm/intrapred_directional_neon.cc     594
-rw-r--r--  src/dsp/arm/intrapred_directional_neon.h      56
-rw-r--r--  src/dsp/arm/intrapred_filter_neon.cc (renamed from src/dsp/arm/intrapred_filter_intra_neon.cc)  10
-rw-r--r--  src/dsp/arm/intrapred_filter_neon.h           37
-rw-r--r--  src/dsp/arm/intrapred_neon.cc                 247
-rw-r--r--  src/dsp/arm/intrapred_neon.h                  218
-rw-r--r--  src/dsp/arm/intrapred_smooth_neon.cc          5
-rw-r--r--  src/dsp/arm/intrapred_smooth_neon.h           149
-rw-r--r--  src/dsp/arm/inverse_transform_10bit_neon.cc   2543
-rw-r--r--  src/dsp/arm/inverse_transform_neon.cc         2
-rw-r--r--  src/dsp/arm/inverse_transform_neon.h          16
-rw-r--r--  src/dsp/arm/loop_filter_neon.cc               18
-rw-r--r--  src/dsp/arm/loop_restoration_neon.cc          1470
-rw-r--r--  src/dsp/arm/mask_blend_neon.cc                2
-rw-r--r--  src/dsp/arm/motion_field_projection_neon.cc   2
-rw-r--r--  src/dsp/arm/motion_vector_search_neon.cc      2
-rw-r--r--  src/dsp/arm/obmc_neon.cc                      2
-rw-r--r--  src/dsp/arm/super_res_neon.cc                 151
-rw-r--r--  src/dsp/arm/super_res_neon.h                  5
-rw-r--r--  src/dsp/arm/warp_neon.cc                      4
-rw-r--r--  src/dsp/arm/weight_mask_neon.cc               2
32 files changed, 7046 insertions, 1251 deletions
diff --git a/src/dsp/arm/average_blend_neon.cc b/src/dsp/arm/average_blend_neon.cc
index 834e8b4..5b4c094 100644
--- a/src/dsp/arm/average_blend_neon.cc
+++ b/src/dsp/arm/average_blend_neon.cc
@@ -35,6 +35,11 @@ namespace {
constexpr int kInterPostRoundBit =
kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
inline uint8x8_t AverageBlend8Row(const int16_t* prediction_0,
const int16_t* prediction_1) {
const int16x8_t pred0 = vld1q_s16(prediction_0);
@@ -128,13 +133,139 @@ void Init8bpp() {
}
} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+inline uint16x8_t AverageBlend8Row(const uint16_t* prediction_0,
+ const uint16_t* prediction_1,
+ const int32x4_t compound_offset,
+ const uint16x8_t v_bitdepth) {
+ const uint16x8_t pred0 = vld1q_u16(prediction_0);
+ const uint16x8_t pred1 = vld1q_u16(prediction_1);
+ const uint32x4_t pred_lo =
+ vaddl_u16(vget_low_u16(pred0), vget_low_u16(pred1));
+ const uint32x4_t pred_hi =
+ vaddl_u16(vget_high_u16(pred0), vget_high_u16(pred1));
+ const int32x4_t offset_lo =
+ vsubq_s32(vreinterpretq_s32_u32(pred_lo), compound_offset);
+ const int32x4_t offset_hi =
+ vsubq_s32(vreinterpretq_s32_u32(pred_hi), compound_offset);
+ const uint16x4_t res_lo = vqrshrun_n_s32(offset_lo, kInterPostRoundBit + 1);
+ const uint16x4_t res_hi = vqrshrun_n_s32(offset_hi, kInterPostRoundBit + 1);
+ return vminq_u16(vcombine_u16(res_lo, res_hi), v_bitdepth);
+}
+
+inline void AverageBlendLargeRow(const uint16_t* prediction_0,
+ const uint16_t* prediction_1, const int width,
+ uint16_t* dest,
+ const int32x4_t compound_offset,
+ const uint16x8_t v_bitdepth) {
+ int x = width;
+ do {
+ vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1,
+ compound_offset, v_bitdepth));
+ prediction_0 += 8;
+ prediction_1 += 8;
+ dest += 8;
+
+ vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1,
+ compound_offset, v_bitdepth));
+ prediction_0 += 8;
+ prediction_1 += 8;
+ dest += 8;
+
+ x -= 16;
+ } while (x != 0);
+}
+
+void AverageBlend_NEON(const void* prediction_0, const void* prediction_1,
+ const int width, const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y = height;
+
+ const ptrdiff_t dst_stride = dest_stride >> 1;
+ const int32x4_t compound_offset =
+ vdupq_n_s32(static_cast<int32_t>(kCompoundOffset + kCompoundOffset));
+ const uint16x8_t v_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+ if (width == 4) {
+ do {
+ const uint16x8_t result =
+ AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth);
+ pred_0 += 8;
+ pred_1 += 8;
+
+ vst1_u16(dst, vget_low_u16(result));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(result));
+ dst += dst_stride;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ if (width == 8) {
+ do {
+ vst1q_u16(dst,
+ AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth));
+ dst += dst_stride;
+ pred_0 += 8;
+ pred_1 += 8;
+
+ vst1q_u16(dst,
+ AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth));
+ dst += dst_stride;
+ pred_0 += 8;
+ pred_1 += 8;
+
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ do {
+ AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset,
+ v_bitdepth);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset,
+ v_bitdepth);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ y -= 2;
+ } while (y != 0);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->average_blend = AverageBlend_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
-void AverageBlendInit_NEON() { Init8bpp(); }
+void AverageBlendInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
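
For reference, the new high_bitdepth AverageBlend_NEON path above averages two compound predictions per pixel by summing them, subtracting kCompoundOffset twice, applying a rounding shift of kInterPostRoundBit + 1, and clamping to the 10-bit maximum. A minimal scalar sketch of that arithmetic; the constants are taken as parameters since their values are not part of this patch:

#include <algorithm>
#include <cstdint>

// Scalar model of AverageBlend8Row()'s per-pixel math (illustrative sketch).
uint16_t AverageBlendPixel10bpp(uint16_t pred0, uint16_t pred1,
                                int compound_offset,        // kCompoundOffset
                                int inter_post_round_bit) {  // kInterPostRoundBit
  // vaddl_u16 + vsubq_s32: widen, sum, and remove both compound offsets.
  const int sum = static_cast<int>(pred0) + pred1 - 2 * compound_offset;
  // vqrshrun_n_s32(..., kInterPostRoundBit + 1): the extra bit of shift is
  // what divides the sum of the two predictions by two.
  const int shift = inter_post_round_bit + 1;
  const int rounded = (sum + (1 << (shift - 1))) >> shift;
  // vqrshrun saturates negative results to 0; vminq_u16 caps at 2^10 - 1.
  return static_cast<uint16_t>(std::min(std::max(rounded, 0), (1 << 10) - 1));
}
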
diff --git a/src/dsp/arm/cdef_neon.cc b/src/dsp/arm/cdef_neon.cc
index 4d0e76f..60c72d6 100644
--- a/src/dsp/arm/cdef_neon.cc
+++ b/src/dsp/arm/cdef_neon.cc
@@ -265,7 +265,7 @@ LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source,
// 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00
// 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00
// 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00
- partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), partial_lo[2], 0);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), vdupq_n_u16(0), 0);
partial_lo[2] = vsetq_lane_u16(SumVector(v_src[1]), partial_lo[2], 1);
partial_lo[2] = vsetq_lane_u16(SumVector(v_src[2]), partial_lo[2], 2);
partial_lo[2] = vsetq_lane_u16(SumVector(v_src[3]), partial_lo[2], 3);
@@ -285,9 +285,8 @@ LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source,
// 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00
// 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00
// 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00
- const uint8x8_t v_zero = vdup_n_u8(0);
- partial_lo[6] = vaddl_u8(v_zero, v_src[0]);
- for (int i = 1; i < 8; ++i) {
+ partial_lo[6] = vaddl_u8(v_src[0], v_src[1]);
+ for (int i = 2; i < 8; ++i) {
partial_lo[6] = vaddw_u8(partial_lo[6], v_src[i]);
}
@@ -451,7 +450,7 @@ void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference,
const uint16x8_t threshold, const int16x8_t damping) {
- // If reference > pixel, the difference will be negative, so covert to 0 or
+ // If reference > pixel, the difference will be negative, so convert to 0 or
// -1.
const uint16x8_t sign = vcgtq_u16(reference, pixel);
const uint16x8_t abs_diff = vabdq_u16(pixel, reference);
@@ -686,7 +685,7 @@ void CdefInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
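
The Constrain() comment above ("convert to 0 or -1") relies on the usual mask-based sign trick: a greater-than comparison yields an all-ones (-1) or all-zeros (0) lane, and (x ^ mask) - mask negates x exactly when the mask is -1. A scalar illustration of that trick only, not of the patch's full Constrain() implementation:

#include <cstdint>

// Apply a sign chosen by a 0 / -1 mask, as vcgtq_u16 produces per lane.
int16_t ApplySignMask(uint16_t magnitude, bool reference_greater_than_pixel) {
  const int16_t mask = reference_greater_than_pixel ? -1 : 0;  // all ones or 0
  // mask == 0:  (m ^ 0) - 0    == m
  // mask == -1: (m ^ -1) - -1  == ~m + 1 == -m  (two's complement negation)
  return static_cast<int16_t>((static_cast<int16_t>(magnitude) ^ mask) - mask);
}
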
diff --git a/src/dsp/arm/common_neon.h b/src/dsp/arm/common_neon.h
index dcb7567..05e0d05 100644
--- a/src/dsp/arm/common_neon.h
+++ b/src/dsp/arm/common_neon.h
@@ -28,8 +28,7 @@
#if 0
#include <cstdio>
-
-#include "absl/strings/str_cat.h"
+#include <string>
constexpr bool kEnablePrintRegs = true;
@@ -86,11 +85,11 @@ inline void PrintVectQ(const DebugRegisterQ r, const char* const name,
inline void PrintReg(const int32x4x2_t val, const std::string& name) {
DebugRegisterQ r;
- vst1q_u32(r.u32, val.val[0]);
- const std::string name0 = absl::StrCat(name, ".val[0]").c_str();
+ vst1q_s32(r.i32, val.val[0]);
+ const std::string name0 = name + std::string(".val[0]");
PrintVectQ(r, name0.c_str(), 32);
- vst1q_u32(r.u32, val.val[1]);
- const std::string name1 = absl::StrCat(name, ".val[1]").c_str();
+ vst1q_s32(r.i32, val.val[1]);
+ const std::string name1 = name + std::string(".val[1]");
PrintVectQ(r, name1.c_str(), 32);
}
@@ -169,14 +168,14 @@ inline void PrintReg(const int8x8_t val, const char* name) {
// Print an individual (non-vector) value in decimal format.
inline void PrintReg(const int x, const char* name) {
if (kEnablePrintRegs) {
- printf("%s: %d\n", name, x);
+ fprintf(stderr, "%s: %d\n", name, x);
}
}
// Print an individual (non-vector) value in hexadecimal format.
inline void PrintHex(const int x, const char* name) {
if (kEnablePrintRegs) {
- printf("%s: %x\n", name, x);
+ fprintf(stderr, "%s: %x\n", name, x);
}
}
@@ -277,22 +276,32 @@ inline void Store2(uint16_t* const buf, const uint16x4_t val) {
ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane));
}
+// Simplify code when caller has |buf| cast as uint8_t*.
+inline void Store4(void* const buf, const uint16x4_t val) {
+ vst1_u16(static_cast<uint16_t*>(buf), val);
+}
+
+// Simplify code when caller has |buf| cast as uint8_t*.
+inline void Store8(void* const buf, const uint16x8_t val) {
+ vst1q_u16(static_cast<uint16_t*>(buf), val);
+}
+
//------------------------------------------------------------------------------
// Bit manipulation.
// vshXX_n_XX() requires an immediate.
template <int shift>
-inline uint8x8_t LeftShift(const uint8x8_t vector) {
+inline uint8x8_t LeftShiftVector(const uint8x8_t vector) {
return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vector), shift));
}
template <int shift>
-inline uint8x8_t RightShift(const uint8x8_t vector) {
+inline uint8x8_t RightShiftVector(const uint8x8_t vector) {
return vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(vector), shift));
}
template <int shift>
-inline int8x8_t RightShift(const int8x8_t vector) {
+inline int8x8_t RightShiftVector(const int8x8_t vector) {
return vreinterpret_s8_u64(vshr_n_u64(vreinterpret_u64_s8(vector), shift));
}
@@ -387,6 +396,15 @@ inline uint16_t SumVector(const uint8x8_t a) {
#endif // defined(__aarch64__)
}
+inline uint32_t SumVector(const uint32x2_t a) {
+#if defined(__aarch64__)
+ return vaddv_u32(a);
+#else
+ const uint64x1_t b = vpaddl_u32(a);
+ return vget_lane_u32(vreinterpret_u32_u64(b), 0);
+#endif // defined(__aarch64__)
+}
+
inline uint32_t SumVector(const uint32x4_t a) {
#if defined(__aarch64__)
return vaddvq_u32(a);
@@ -447,6 +465,36 @@ inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) {
}
// Input:
+// 00 01 02 03
+// 10 11 12 13
+// 20 21 22 23
+// 30 31 32 33
+inline void Transpose4x4(uint16x4_t a[4]) {
+ // b:
+ // 00 10 02 12
+ // 01 11 03 13
+ const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
+ // c:
+ // 20 30 22 32
+ // 21 31 23 33
+ const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
+ // d:
+ // 00 10 20 30
+ // 02 12 22 32
+ const uint32x2x2_t d =
+ vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
+ // e:
+ // 01 11 21 31
+ // 03 13 23 33
+ const uint32x2x2_t e =
+ vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
+ a[0] = vreinterpret_u16_u32(d.val[0]);
+ a[1] = vreinterpret_u16_u32(e.val[0]);
+ a[2] = vreinterpret_u16_u32(d.val[1]);
+ a[3] = vreinterpret_u16_u32(e.val[1]);
+}
+
+// Input:
// a: 00 01 02 03 10 11 12 13
// b: 20 21 22 23 30 31 32 33
// Output:
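
Below, a usage sketch for the 16-bit Transpose4x4() added above, assuming an ARM target with <arm_neon.h> and the Transpose4x4 from the patched common_neon.h visible in the calling scope. Rows go in and columns come out, as the comment lattice documents:

#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

void Transpose4x4Demo() {
  const uint16_t rows[4][4] = {{0x00, 0x01, 0x02, 0x03},
                               {0x10, 0x11, 0x12, 0x13},
                               {0x20, 0x21, 0x22, 0x23},
                               {0x30, 0x31, 0x32, 0x33}};
  uint16x4_t a[4];
  for (int i = 0; i < 4; ++i) a[i] = vld1_u16(rows[i]);
  Transpose4x4(a);  // The function added in this patch.
  uint16_t out[4];
  vst1_u16(out, a[0]);
  // Prints 0 0x10 0x20 0x30: the first output row is the first input column.
  printf("%#x %#x %#x %#x\n", out[0], out[1], out[2], out[3]);
}
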
diff --git a/src/dsp/arm/convolve_neon.cc b/src/dsp/arm/convolve_neon.cc
index fd9b912..331bfe2 100644
--- a/src/dsp/arm/convolve_neon.cc
+++ b/src/dsp/arm/convolve_neon.cc
@@ -101,245 +101,278 @@ int16x8_t SumOnePassTaps(const uint8x8_t* const src,
return vreinterpretq_s16_u16(sum);
}
-template <int filter_index, bool negative_outside_taps>
-int16x8_t SumHorizontalTaps(const uint8_t* const src,
- const uint8x8_t* const v_tap) {
- uint8x8_t v_src[8];
- const uint8x16_t src_long = vld1q_u8(src);
- int16x8_t sum;
-
- if (filter_index < 2) {
- v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 1));
- v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 2));
- v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 3));
- v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 4));
- v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 5));
- v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 6));
- sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 1);
- } else if (filter_index == 2) {
- v_src[0] = vget_low_u8(src_long);
- v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
- v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
- v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
- v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
- v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
- v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
- v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
- sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap);
- } else if (filter_index == 3) {
- v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 3));
- v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 4));
- sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 3);
- } else if (filter_index > 3) {
- v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 2));
- v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 3));
- v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 4));
- v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 5));
- sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 2);
- }
- return sum;
-}
-
-template <int filter_index, bool negative_outside_taps>
-uint8x8_t SimpleHorizontalTaps(const uint8_t* const src,
- const uint8x8_t* const v_tap) {
- int16x8_t sum =
- SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
-
- // Normally the Horizontal pass does the downshift in two passes:
- // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
- // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
- // requires adding the rounding offset from the skipped shift.
- constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
-
- sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
- return vqrshrun_n_s16(sum, kFilterBits - 1);
-}
-
-template <int filter_index, bool negative_outside_taps>
-uint16x8_t HorizontalTaps8To16(const uint8_t* const src,
- const uint8x8_t* const v_tap) {
- const int16x8_t sum =
- SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
-
- return vreinterpretq_u16_s16(
- vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
-}
-
-template <int filter_index>
-int16x8_t SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
- const uint8x8_t* const v_tap) {
- uint16x8_t sum;
- const uint8x8_t input0 = vld1_u8(src);
- src += src_stride;
- const uint8x8_t input1 = vld1_u8(src);
- uint8x8x2_t input = vzip_u8(input0, input1);
-
- if (filter_index == 3) {
- // tap signs : + +
- sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
- sum = vmlal_u8(sum, input.val[1], v_tap[4]);
- } else if (filter_index == 4) {
- // tap signs : - + + -
- sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
- sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
- sum = vmlal_u8(sum, input.val[1], v_tap[4]);
- sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
- } else {
- // tap signs : + + + +
- sum = vmull_u8(RightShift<4 * 8>(input.val[0]), v_tap[2]);
- sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
- sum = vmlal_u8(sum, input.val[1], v_tap[4]);
- sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
- }
-
- return vreinterpretq_s16_u16(sum);
-}
-
-template <int filter_index>
-uint8x8_t SimpleHorizontalTaps2x2(const uint8_t* src,
- const ptrdiff_t src_stride,
- const uint8x8_t* const v_tap) {
- int16x8_t sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
- // Normally the Horizontal pass does the downshift in two passes:
- // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
- // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
- // requires adding the rounding offset from the skipped shift.
- constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
-
- sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
- return vqrshrun_n_s16(sum, kFilterBits - 1);
-}
-
-template <int filter_index>
-uint16x8_t HorizontalTaps8To16_2x2(const uint8_t* src,
- const ptrdiff_t src_stride,
- const uint8x8_t* const v_tap) {
- const int16x8_t sum =
- SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
- return vreinterpretq_u16_s16(
- vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
-}
-
-template <int num_taps, int step, int filter_index,
- bool negative_outside_taps = true, bool is_2d = false,
- bool is_compound = false>
-void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dest, const ptrdiff_t pred_stride,
- const int width, const int height,
- const uint8x8_t* const v_tap) {
+template <int filter_index, bool negative_outside_taps, bool is_2d,
+ bool is_compound>
+void FilterHorizontalWidth8AndUp(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int width, const int height,
+ const uint8x8_t* const v_tap) {
auto* dest8 = static_cast<uint8_t*>(dest);
auto* dest16 = static_cast<uint16_t*>(dest);
-
- // 4 tap filters are never used when width > 4.
- if (num_taps != 4 && width > 4) {
- int y = 0;
+ if (!is_2d) {
+ int y = height;
do {
int x = 0;
- do {
- if (is_2d || is_compound) {
- const uint16x8_t v_sum =
- HorizontalTaps8To16<filter_index, negative_outside_taps>(&src[x],
- v_tap);
+ do { // Increasing loop counter x is better.
+ const uint8x16_t src_long = vld1q_u8(src + x);
+ uint8x8_t v_src[8];
+ int16x8_t sum;
+ if (filter_index < 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+ v_tap + 1);
+ } else if (filter_index == 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+ v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
+ } else if (filter_index == 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+ } else if (filter_index > 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+ }
+ if (is_compound) {
+ const uint16x8_t v_sum = vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
vst1q_u16(&dest16[x], v_sum);
} else {
- const uint8x8_t result =
- SimpleHorizontalTaps<filter_index, negative_outside_taps>(&src[x],
- v_tap);
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+ // Combining them requires adding the rounding offset from the skipped
+ // shift.
+ constexpr int first_shift_rounding_bit =
+ 1 << (kInterRoundBitsHorizontal - 2);
+ sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+ const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
vst1_u8(&dest8[x], result);
}
- x += step;
+ x += 8;
} while (x < width);
src += src_stride;
dest8 += pred_stride;
dest16 += pred_stride;
- } while (++y < height);
+ } while (--y != 0);
+ } else {
+ int x = 0;
+ do {
+ const uint8_t* s = src + x;
+ int y = height;
+ do { // Increasing loop counter x is better.
+ const uint8x16_t src_long = vld1q_u8(s);
+ uint8x8_t v_src[8];
+ int16x8_t sum;
+ if (filter_index < 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+ v_tap + 1);
+ } else if (filter_index == 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+ v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
+ } else if (filter_index == 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+ } else if (filter_index > 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+ }
+ const uint16x8_t v_sum = vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+ vst1q_u16(dest16, v_sum);
+ s += src_stride;
+ dest16 += 8;
+ } while (--y != 0);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+template <int filter_index, bool is_2d, bool is_compound>
+void FilterHorizontalWidth4(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int height, const uint8x8_t* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+ int y = height;
+ do {
+ uint8x8_t v_src[4];
+ int16x8_t sum;
+ v_src[0] = vld1_u8(src);
+ if (filter_index == 3) {
+ v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+ } else {
+ v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
+ v_src[2] = RightShiftVector<2 * 8>(v_src[0]);
+ v_src[3] = RightShiftVector<3 * 8>(v_src[0]);
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+ }
+ if (is_2d || is_compound) {
+ const uint16x4_t v_sum = vreinterpret_u16_s16(
+ vrshr_n_s16(vget_low_s16(sum), kInterRoundBitsHorizontal - 1));
+ vst1_u16(dest16, v_sum);
+ } else {
+ constexpr int first_shift_rounding_bit =
+ 1 << (kInterRoundBitsHorizontal - 2);
+ sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+ const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
+ StoreLo4(&dest8[0], result);
+ }
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (--y != 0);
+}
+
+template <int filter_index, bool is_2d>
+void FilterHorizontalWidth2(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int height, const uint8x8_t* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+ int y = height >> 1;
+ do {
+ const uint8x8_t input0 = vld1_u8(src);
+ const uint8x8_t input1 = vld1_u8(src + src_stride);
+ const uint8x8x2_t input = vzip_u8(input0, input1);
+ uint16x8_t sum;
+ if (filter_index == 3) {
+ // tap signs : + +
+ sum = vmull_u8(input.val[0], v_tap[3]);
+ sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 2), v_tap[4]);
+ } else if (filter_index == 4) {
+ // tap signs : - + + -
+ sum = vmull_u8(RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
+ sum = vmlsl_u8(sum, input.val[0], v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
+ sum = vmlsl_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
+ } else {
+ // tap signs : + + + +
+ sum = vmull_u8(input.val[0], v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
+ sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
+ sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
+ }
+ int16x8_t s = vreinterpretq_s16_u16(sum);
+ if (is_2d) {
+ const uint16x8_t v_sum =
+ vreinterpretq_u16_s16(vrshrq_n_s16(s, kInterRoundBitsHorizontal - 1));
+ dest16[0] = vgetq_lane_u16(v_sum, 0);
+ dest16[1] = vgetq_lane_u16(v_sum, 2);
+ dest16 += pred_stride;
+ dest16[0] = vgetq_lane_u16(v_sum, 1);
+ dest16[1] = vgetq_lane_u16(v_sum, 3);
+ dest16 += pred_stride;
+ } else {
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+ // Combining them requires adding the rounding offset from the skipped
+ // shift.
+ constexpr int first_shift_rounding_bit =
+ 1 << (kInterRoundBitsHorizontal - 2);
+ s = vaddq_s16(s, vdupq_n_s16(first_shift_rounding_bit));
+ const uint8x8_t result = vqrshrun_n_s16(s, kFilterBits - 1);
+ dest8[0] = vget_lane_u8(result, 0);
+ dest8[1] = vget_lane_u8(result, 2);
+ dest8 += pred_stride;
+ dest8[0] = vget_lane_u8(result, 1);
+ dest8[1] = vget_lane_u8(result, 3);
+ dest8 += pred_stride;
+ }
+ src += src_stride << 1;
+ } while (--y != 0);
+
+ // The 2d filters have an odd |height| because the horizontal pass
+ // generates context for the vertical pass.
+ if (is_2d) {
+ assert(height % 2 == 1);
+ const uint8x8_t input = vld1_u8(src);
+ uint16x8_t sum;
+ if (filter_index == 3) {
+ sum = vmull_u8(input, v_tap[3]);
+ sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[4]);
+ } else if (filter_index == 4) {
+ sum = vmull_u8(RightShiftVector<1 * 8>(input), v_tap[3]);
+ sum = vmlsl_u8(sum, input, v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
+ sum = vmlsl_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
+ } else {
+ assert(filter_index == 5);
+ sum = vmull_u8(input, v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[3]);
+ sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
+ sum = vmlal_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
+ }
+ // |sum| contains an int16_t value.
+ sum = vreinterpretq_u16_s16(vrshrq_n_s16(vreinterpretq_s16_u16(sum),
+ kInterRoundBitsHorizontal - 1));
+ Store2<0>(dest16, sum);
+ }
+}
+
+template <int filter_index, bool negative_outside_taps, bool is_2d,
+ bool is_compound>
+void FilterHorizontal(const uint8_t* const src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int width, const int height,
+ const uint8x8_t* const v_tap) {
+ assert(width < 8 || filter_index <= 3);
+ // Don't simplify the redundant if conditions with the template parameters,
+ // which helps the compiler generate compact code.
+ if (width >= 8 && filter_index <= 3) {
+ FilterHorizontalWidth8AndUp<filter_index, negative_outside_taps, is_2d,
+ is_compound>(src, src_stride, dest, pred_stride,
+ width, height, v_tap);
return;
}
- // Horizontal passes only needs to account for |num_taps| 2 and 4 when
+ // Horizontal passes only needs to account for number of taps 2 and 4 when
// |width| <= 4.
assert(width <= 4);
- assert(num_taps <= 4);
- if (num_taps <= 4) {
+ assert(filter_index >= 3 && filter_index <= 5);
+ if (filter_index >= 3 && filter_index <= 5) {
if (width == 4) {
- int y = 0;
- do {
- if (is_2d || is_compound) {
- const uint16x8_t v_sum =
- HorizontalTaps8To16<filter_index, negative_outside_taps>(src,
- v_tap);
- vst1_u16(dest16, vget_low_u16(v_sum));
- } else {
- const uint8x8_t result =
- SimpleHorizontalTaps<filter_index, negative_outside_taps>(src,
- v_tap);
- StoreLo4(&dest8[0], result);
- }
- src += src_stride;
- dest8 += pred_stride;
- dest16 += pred_stride;
- } while (++y < height);
+ FilterHorizontalWidth4<filter_index, is_2d, is_compound>(
+ src, src_stride, dest, pred_stride, height, v_tap);
return;
}
-
+ assert(width == 2);
if (!is_compound) {
- int y = 0;
- do {
- if (is_2d) {
- const uint16x8_t sum =
- HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
- dest16[0] = vgetq_lane_u16(sum, 0);
- dest16[1] = vgetq_lane_u16(sum, 2);
- dest16 += pred_stride;
- dest16[0] = vgetq_lane_u16(sum, 1);
- dest16[1] = vgetq_lane_u16(sum, 3);
- dest16 += pred_stride;
- } else {
- const uint8x8_t sum =
- SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
- dest8[0] = vget_lane_u8(sum, 0);
- dest8[1] = vget_lane_u8(sum, 2);
- dest8 += pred_stride;
-
- dest8[0] = vget_lane_u8(sum, 1);
- dest8[1] = vget_lane_u8(sum, 3);
- dest8 += pred_stride;
- }
-
- src += src_stride << 1;
- y += 2;
- } while (y < height - 1);
-
- // The 2d filters have an odd |height| because the horizontal pass
- // generates context for the vertical pass.
- if (is_2d) {
- assert(height % 2 == 1);
- uint16x8_t sum;
- const uint8x8_t input = vld1_u8(src);
- if (filter_index == 3) { // |num_taps| == 2
- sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
- sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
- } else if (filter_index == 4) {
- sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
- sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
- sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
- sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
- } else {
- assert(filter_index == 5);
- sum = vmull_u8(RightShift<2 * 8>(input), v_tap[2]);
- sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
- sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
- sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
- }
- // |sum| contains an int16_t value.
- sum = vreinterpretq_u16_s16(vrshrq_n_s16(
- vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1));
- Store2<0>(dest16, sum);
- }
+ FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest,
+ pred_stride, height, v_tap);
}
}
}
@@ -451,78 +484,85 @@ int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
}
template <int num_taps, bool is_compound = false>
-void Filter2DVertical(const uint16_t* src, void* const dst,
- const ptrdiff_t dst_stride, const int width,
- const int height, const int16x8_t taps) {
+void Filter2DVerticalWidth8AndUp(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const int16x8_t taps) {
assert(width >= 8);
constexpr int next_row = num_taps - 1;
- // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
- const ptrdiff_t src_stride = width;
-
- auto* dst8 = static_cast<uint8_t*>(dst);
- auto* dst16 = static_cast<uint16_t*>(dst);
+ auto* const dst8 = static_cast<uint8_t*>(dst);
+ auto* const dst16 = static_cast<uint16_t*>(dst);
int x = 0;
do {
- int16x8_t srcs[8];
- const uint16_t* src_x = src + x;
- srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
+ int16x8_t srcs[9];
+ srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
if (num_taps >= 4) {
- srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
- srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
+ srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
if (num_taps >= 6) {
- srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
- srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
+ srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
if (num_taps == 8) {
- srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
- srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
+ srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
}
}
}
- int y = 0;
+ uint8_t* d8 = dst8 + x;
+ uint16_t* d16 = dst16 + x;
+ int y = height;
do {
- srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
-
- const int16x8_t sum =
- SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[next_row + 1] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ const int16x8_t sum0 =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 0, taps);
+ const int16x8_t sum1 =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 1, taps);
if (is_compound) {
- vst1q_u16(dst16 + x + y * dst_stride, vreinterpretq_u16_s16(sum));
+ vst1q_u16(d16, vreinterpretq_u16_s16(sum0));
+ d16 += dst_stride;
+ vst1q_u16(d16, vreinterpretq_u16_s16(sum1));
+ d16 += dst_stride;
} else {
- vst1_u8(dst8 + x + y * dst_stride, vqmovun_s16(sum));
+ vst1_u8(d8, vqmovun_s16(sum0));
+ d8 += dst_stride;
+ vst1_u8(d8, vqmovun_s16(sum1));
+ d8 += dst_stride;
}
-
- srcs[0] = srcs[1];
+ srcs[0] = srcs[2];
if (num_taps >= 4) {
- srcs[1] = srcs[2];
- srcs[2] = srcs[3];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
if (num_taps >= 6) {
- srcs[3] = srcs[4];
- srcs[4] = srcs[5];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
if (num_taps == 8) {
- srcs[5] = srcs[6];
- srcs[6] = srcs[7];
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
}
}
}
- } while (++y < height);
+ y -= 2;
+ } while (y != 0);
x += 8;
} while (x < width);
}
// Take advantage of |src_stride| == |width| to process two rows at a time.
template <int num_taps, bool is_compound = false>
-void Filter2DVertical4xH(const uint16_t* src, void* const dst,
- const ptrdiff_t dst_stride, const int height,
- const int16x8_t taps) {
+void Filter2DVerticalWidth4(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x8_t taps) {
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -545,7 +585,7 @@ void Filter2DVertical4xH(const uint16_t* src, void* const dst,
}
}
- int y = 0;
+ int y = height;
do {
srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
src += 8;
@@ -580,15 +620,15 @@ void Filter2DVertical4xH(const uint16_t* src, void* const dst,
}
}
}
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
// Take advantage of |src_stride| == |width| to process four rows at a time.
template <int num_taps>
-void Filter2DVertical2xH(const uint16_t* src, void* const dst,
- const ptrdiff_t dst_stride, const int height,
- const int16x8_t taps) {
+void Filter2DVerticalWidth2(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x8_t taps) {
constexpr int next_row = (num_taps < 6) ? 4 : 8;
auto* dst8 = static_cast<uint8_t*>(dst);
@@ -672,29 +712,47 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
}
if (filter_index == 2) { // 8 tap.
- FilterHorizontal<8, 8, 2, true, is_2d, is_compound>(
+ FilterHorizontal<2, true, is_2d, is_compound>(
src, src_stride, dst, dst_stride, width, height, v_tap);
} else if (filter_index == 1) { // 6 tap.
// Check if outside taps are positive.
if ((filter_id == 1) | (filter_id == 15)) {
- FilterHorizontal<6, 8, 1, false, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<1, false, is_2d, is_compound>(
+ src + 1, src_stride, dst, dst_stride, width, height, v_tap);
} else {
- FilterHorizontal<6, 8, 1, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<1, true, is_2d, is_compound>(
+ src + 1, src_stride, dst, dst_stride, width, height, v_tap);
}
} else if (filter_index == 0) { // 6 tap.
- FilterHorizontal<6, 8, 0, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<0, true, is_2d, is_compound>(
+ src + 1, src_stride, dst, dst_stride, width, height, v_tap);
} else if (filter_index == 4) { // 4 tap.
- FilterHorizontal<4, 8, 4, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<4, true, is_2d, is_compound>(
+ src + 2, src_stride, dst, dst_stride, width, height, v_tap);
} else if (filter_index == 5) { // 4 tap.
- FilterHorizontal<4, 8, 5, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<5, true, is_2d, is_compound>(
+ src + 2, src_stride, dst, dst_stride, width, height, v_tap);
} else { // 2 tap.
- FilterHorizontal<2, 8, 3, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<3, true, is_2d, is_compound>(
+ src + 3, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+}
+
+template <int vertical_taps>
+void Filter2DVertical(const uint16_t* const intermediate_result,
+ const int width, const int height, const int16x8_t taps,
+ void* const prediction, const ptrdiff_t pred_stride) {
+ auto* const dest = static_cast<uint8_t*>(prediction);
+ if (width >= 8) {
+ Filter2DVerticalWidth8AndUp<vertical_taps>(
+ intermediate_result, dest, pred_stride, width, height, taps);
+ } else if (width == 4) {
+ Filter2DVerticalWidth4<vertical_taps>(intermediate_result, dest,
+ pred_stride, height, taps);
+ } else {
+ assert(width == 2);
+ Filter2DVerticalWidth2<vertical_taps>(intermediate_result, dest,
+ pred_stride, height, taps);
}
}
@@ -704,7 +762,7 @@ void Convolve2D_NEON(const void* const reference,
const int vertical_filter_index,
const int horizontal_filter_id,
const int vertical_filter_id, const int width,
- const int height, void* prediction,
+ const int height, void* const prediction,
const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
@@ -715,67 +773,31 @@ void Convolve2D_NEON(const void* const reference,
intermediate_result[kMaxSuperBlockSizeInPixels *
(kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
const int intermediate_height = height + vertical_taps - 1;
-
const ptrdiff_t src_stride = reference_stride;
- const auto* src = static_cast<const uint8_t*>(reference) -
- (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+ const auto* const src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride -
+ kHorizontalOffset;
DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
width, intermediate_height,
horizontal_filter_id, horiz_filter_index);
// Vertical filter.
- auto* dest = static_cast<uint8_t*>(prediction);
- const ptrdiff_t dest_stride = pred_stride;
assert(vertical_filter_id != 0);
-
const int16x8_t taps = vmovl_s8(
vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
-
if (vertical_taps == 8) {
- if (width == 2) {
- Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
- taps);
- } else if (width == 4) {
- Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
- taps);
- } else {
- Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
- taps);
- }
+ Filter2DVertical<8>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
} else if (vertical_taps == 6) {
- if (width == 2) {
- Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
- taps);
- } else if (width == 4) {
- Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
- taps);
- } else {
- Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
- taps);
- }
+ Filter2DVertical<6>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
} else if (vertical_taps == 4) {
- if (width == 2) {
- Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
- taps);
- } else if (width == 4) {
- Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
- taps);
- } else {
- Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
- taps);
- }
+ Filter2DVertical<4>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
} else { // |vertical_taps| == 2
- if (width == 2) {
- Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
- taps);
- } else if (width == 4) {
- Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
- taps);
- } else {
- Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
- taps);
- }
+ Filter2DVertical<2>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
}
}
@@ -788,7 +810,7 @@ void Convolve2D_NEON(const void* const reference,
// increments. The first load covers the initial elements of src_x, while the
// final load covers the taps.
template <int grade_x>
-inline uint8x8x3_t LoadSrcVals(const uint8_t* src_x) {
+inline uint8x8x3_t LoadSrcVals(const uint8_t* const src_x) {
uint8x8x3_t ret;
const uint8x16_t src_val = vld1q_u8(src_x);
ret.val[0] = vget_low_u8(src_val);
@@ -811,7 +833,7 @@ inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
}
template <int grade_x>
-inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
+inline void ConvolveKernelHorizontal2Tap(const uint8_t* const src,
const ptrdiff_t src_stride,
const int width, const int subpixel_x,
const int step_x,
@@ -843,7 +865,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
// on x.
const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
VQTbl1U8(filter_taps1, filter_indices)};
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x16_t src_vals = vld1q_u8(src_x);
@@ -860,7 +882,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
return;
}
@@ -883,7 +905,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
// on x.
const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
VQTbl1U8(filter_taps1, filter_indices)};
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -900,7 +922,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
x += 8;
p += step_x8;
} while (x < width);
@@ -921,7 +943,7 @@ inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
// This filter is only possible when width <= 4.
void ConvolveKernelHorizontalPositive4Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x,
const int step_x, const int intermediate_height, int16_t* intermediate) {
const int kernel_offset = 2;
const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -950,7 +972,7 @@ void ConvolveKernelHorizontalPositive4Tap(
const uint8x8_t src_indices =
vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped index vectors.
const uint8x16_t src_vals = vld1q_u8(src_x);
@@ -970,7 +992,7 @@ void ConvolveKernelHorizontalPositive4Tap(
src_x += src_stride;
intermediate += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
}
// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
@@ -988,7 +1010,7 @@ inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
// This filter is only possible when width <= 4.
inline void ConvolveKernelHorizontalSigned4Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x,
const int step_x, const int intermediate_height, int16_t* intermediate) {
const int kernel_offset = 2;
const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -1025,7 +1047,7 @@ inline void ConvolveKernelHorizontalSigned4Tap(
vadd_u8(src_indices_base, vdup_n_u8(2)),
vadd_u8(src_indices_base, vdup_n_u8(3))};
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x16_t src_vals = vld1q_u8(src_x);
@@ -1042,7 +1064,7 @@ inline void ConvolveKernelHorizontalSigned4Tap(
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
}
// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
@@ -1063,9 +1085,9 @@ inline uint8x16_t GetSigned6TapFilter(const int tap_index) {
// This filter is only possible when width >= 8.
template <int grade_x>
inline void ConvolveKernelHorizontalSigned6Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int width,
const int subpixel_x, const int step_x, const int intermediate_height,
- int16_t* intermediate) {
+ int16_t* const intermediate) {
const int kernel_offset = 1;
const uint8x8_t one = vdup_n_u8(1);
const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
@@ -1107,7 +1129,7 @@ inline void ConvolveKernelHorizontalSigned6Tap(
for (int i = 0; i < 6; ++i) {
taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
}
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -1122,7 +1144,7 @@ inline void ConvolveKernelHorizontalSigned6Tap(
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
x += 8;
p += step_x8;
} while (x < width);
@@ -1156,9 +1178,9 @@ inline int8x16_t GetMixed6TapFilter(const int tap_index) {
// This filter is only possible when width >= 8.
template <int grade_x>
inline void ConvolveKernelHorizontalMixed6Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int width,
const int subpixel_x, const int step_x, const int intermediate_height,
- int16_t* intermediate) {
+ int16_t* const intermediate) {
const int kernel_offset = 1;
const uint8x8_t one = vdup_n_u8(1);
const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
@@ -1205,7 +1227,7 @@ inline void ConvolveKernelHorizontalMixed6Tap(
mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -1224,7 +1246,7 @@ inline void ConvolveKernelHorizontalMixed6Tap(
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
x += 8;
p += step_x8;
} while (x < width);
@@ -1250,9 +1272,9 @@ inline uint8x16_t GetSigned8TapFilter(const int tap_index) {
// This filter is only possible when width >= 8.
template <int grade_x>
inline void ConvolveKernelHorizontalSigned8Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int width,
const int subpixel_x, const int step_x, const int intermediate_height,
- int16_t* intermediate) {
+ int16_t* const intermediate) {
const uint8x8_t one = vdup_n_u8(1);
const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -1290,7 +1312,7 @@ inline void ConvolveKernelHorizontalSigned8Tap(
taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
}
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -1306,7 +1328,7 @@ inline void ConvolveKernelHorizontalSigned8Tap(
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
x += 8;
p += step_x8;
} while (x < width);
@@ -1314,9 +1336,9 @@ inline void ConvolveKernelHorizontalSigned8Tap(
// This function handles blocks of width 2 or 4.
template <int num_taps, int grade_y, int width, bool is_compound>
-void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
+void ConvolveVerticalScale4xH(const int16_t* const src, const int subpixel_y,
const int filter_index, const int step_y,
- const int height, void* dest,
+ const int height, void* const dest,
const ptrdiff_t dest_stride) {
constexpr ptrdiff_t src_stride = kIntermediateStride;
const int16_t* src_y = src;
@@ -1327,8 +1349,8 @@ void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
int p = subpixel_y & 1023;
int prev_p = p;
- int y = 0;
- do { // y < height
+ int y = height;
+ do {
for (int i = 0; i < num_taps; ++i) {
s[i] = vld1_s16(src_y + i * src_stride);
}
@@ -1381,16 +1403,16 @@ void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
prev_p = p;
dest16_y += dest_stride;
dest_y += dest_stride;
-
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
template <int num_taps, int grade_y, bool is_compound>
-inline void ConvolveVerticalScale(const int16_t* src, const int width,
+inline void ConvolveVerticalScale(const int16_t* const src, const int width,
const int subpixel_y, const int filter_index,
const int step_y, const int height,
- void* dest, const ptrdiff_t dest_stride) {
+ void* const dest,
+ const ptrdiff_t dest_stride) {
constexpr ptrdiff_t src_stride = kIntermediateStride;
// A possible improvement is to use arithmetic to decide how many times to
// apply filters to same source before checking whether to load new srcs.
@@ -1401,15 +1423,15 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width,
uint8_t* dest_y;
int x = 0;
- do { // x < width
- const int16_t* src_x = src + x;
+ do {
+ const int16_t* const src_x = src + x;
const int16_t* src_y = src_x;
dest16_y = static_cast<uint16_t*>(dest) + x;
dest_y = static_cast<uint8_t*>(dest) + x;
int p = subpixel_y & 1023;
int prev_p = p;
- int y = 0;
- do { // y < height
+ int y = height;
+ do {
for (int i = 0; i < num_taps; ++i) {
s[i] = vld1q_s16(src_y + i * src_stride);
}
@@ -1448,9 +1470,8 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width,
prev_p = p;
dest16_y += dest_stride;
dest_y += dest_stride;
-
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
x += 8;
} while (x < width);
}
@@ -1462,7 +1483,7 @@ void ConvolveScale2D_NEON(const void* const reference,
const int vertical_filter_index, const int subpixel_x,
const int subpixel_y, const int step_x,
const int step_y, const int width, const int height,
- void* prediction, const ptrdiff_t pred_stride) {
+ void* const prediction, const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
assert(step_x <= 2048);
@@ -1699,12 +1720,13 @@ void ConvolveHorizontal_NEON(const void* const reference,
const int /*vertical_filter_index*/,
const int horizontal_filter_id,
const int /*vertical_filter_id*/, const int width,
- const int height, void* prediction,
+ const int height, void* const prediction,
const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
// Set |src| to the outermost tap.
- const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
- auto* dest = static_cast<uint8_t*>(prediction);
+ const auto* const src =
+ static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* const dest = static_cast<uint8_t*>(prediction);
DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
horizontal_filter_id, filter_index);
@@ -1719,14 +1741,14 @@ uint16x8_t Compound1DShift(const int16x8_t sum) {
template <int filter_index, bool is_compound = false,
bool negative_outside_taps = false>
-void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
+void FilterVertical(const uint8_t* const src, const ptrdiff_t src_stride,
void* const dst, const ptrdiff_t dst_stride,
const int width, const int height,
const uint8x8_t* const taps) {
const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
- auto* dst8 = static_cast<uint8_t*>(dst);
- auto* dst16 = static_cast<uint16_t*>(dst);
+ auto* const dst8 = static_cast<uint8_t*>(dst);
+ auto* const dst16 = static_cast<uint16_t*>(dst);
assert(width >= 8);
int x = 0;
@@ -1754,6 +1776,9 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
}
}
+ // Decreasing the y loop counter produces worse code with clang.
+ // Don't unroll this loop since it generates too much code and the decoder
+ // is even slower.
int y = 0;
do {
srcs[next_row] = vld1_u8(src_x);
@@ -1804,7 +1829,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[0] = Load4(src);
src += src_stride;
- int y = 0;
+ int y = height;
do {
srcs[0] = Load4<1>(src, srcs[0]);
src += src_stride;
@@ -1829,8 +1854,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
}
srcs[0] = srcs[2];
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else if (num_taps == 4) {
srcs[4] = vdup_n_u8(0);
@@ -1842,7 +1867,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
src += src_stride;
srcs[1] = vext_u8(srcs[0], srcs[2], 4);
- int y = 0;
+ int y = height;
do {
srcs[2] = Load4<1>(src, srcs[2]);
src += src_stride;
@@ -1869,8 +1894,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[0] = srcs[2];
srcs[1] = srcs[3];
srcs[2] = srcs[4];
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else if (num_taps == 6) {
srcs[6] = vdup_n_u8(0);
@@ -1887,7 +1912,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
src += src_stride;
srcs[3] = vext_u8(srcs[2], srcs[4], 4);
- int y = 0;
+ int y = height;
do {
srcs[4] = Load4<1>(src, srcs[4]);
src += src_stride;
@@ -1916,8 +1941,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[2] = srcs[4];
srcs[3] = srcs[5];
srcs[4] = srcs[6];
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else if (num_taps == 8) {
srcs[8] = vdup_n_u8(0);
@@ -1939,7 +1964,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
src += src_stride;
srcs[5] = vext_u8(srcs[4], srcs[6], 4);
- int y = 0;
+ int y = height;
do {
srcs[6] = Load4<1>(src, srcs[6]);
src += src_stride;
@@ -1970,8 +1995,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[4] = srcs[6];
srcs[5] = srcs[7];
srcs[6] = srcs[8];
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
}
@@ -2186,14 +2211,14 @@ void ConvolveVertical_NEON(const void* const reference,
const int vertical_filter_index,
const int /*horizontal_filter_id*/,
const int vertical_filter_id, const int width,
- const int height, void* prediction,
+ const int height, void* const prediction,
const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(filter_index);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
- auto* dest = static_cast<uint8_t*>(prediction);
+ auto* const dest = static_cast<uint8_t*>(prediction);
const ptrdiff_t dest_stride = pred_stride;
assert(vertical_filter_id != 0);
@@ -2303,7 +2328,7 @@ void ConvolveCompoundCopy_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
- const int width, const int height, void* prediction,
+ const int width, const int height, void* const prediction,
const ptrdiff_t /*pred_stride*/) {
const auto* src = static_cast<const uint8_t*>(reference);
const ptrdiff_t src_stride = reference_stride;
@@ -2312,7 +2337,7 @@ void ConvolveCompoundCopy_NEON(
kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
if (width >= 16) {
- int y = 0;
+ int y = height;
do {
int x = 0;
do {
@@ -2328,20 +2353,20 @@ void ConvolveCompoundCopy_NEON(
} while (x < width);
src += src_stride;
dest += width;
- } while (++y < height);
+ } while (--y != 0);
} else if (width == 8) {
- int y = 0;
+ int y = height;
do {
const uint8x8_t v_src = vld1_u8(&src[0]);
const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
vst1q_u16(&dest[0], v_dest);
src += src_stride;
dest += width;
- } while (++y < height);
- } else { /* width == 4 */
+ } while (--y != 0);
+ } else { // width == 4
uint8x8_t v_src = vdup_n_u8(0);
- int y = 0;
+ int y = height;
do {
v_src = Load4<0>(&src[0], v_src);
src += src_stride;
@@ -2350,8 +2375,8 @@ void ConvolveCompoundCopy_NEON(
const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
vst1q_u16(&dest[0], v_dest);
dest += 4 << 1;
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
}
@@ -2359,14 +2384,14 @@ void ConvolveCompoundVertical_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/, const int vertical_filter_index,
const int /*horizontal_filter_id*/, const int vertical_filter_id,
- const int width, const int height, void* prediction,
+ const int width, const int height, void* const prediction,
const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(filter_index);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
- auto* dest = static_cast<uint16_t*>(prediction);
+ auto* const dest = static_cast<uint16_t*>(prediction);
assert(vertical_filter_id != 0);
uint8x8_t taps[8];
@@ -2454,24 +2479,39 @@ void ConvolveCompoundHorizontal_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int horizontal_filter_index, const int /*vertical_filter_index*/,
const int horizontal_filter_id, const int /*vertical_filter_id*/,
- const int width, const int height, void* prediction,
+ const int width, const int height, void* const prediction,
const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
- const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
- auto* dest = static_cast<uint16_t*>(prediction);
+ const auto* const src =
+ static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* const dest = static_cast<uint16_t*>(prediction);
DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
src, reference_stride, dest, width, width, height, horizontal_filter_id,
filter_index);
}
+template <int vertical_taps>
+void Compound2DVertical(const uint16_t* const intermediate_result,
+ const int width, const int height, const int16x8_t taps,
+ void* const prediction) {
+ auto* const dest = static_cast<uint16_t*>(prediction);
+ if (width == 4) {
+ Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>(
+ intermediate_result, dest, width, height, taps);
+ } else {
+ Filter2DVerticalWidth8AndUp<vertical_taps, /*is_compound=*/true>(
+ intermediate_result, dest, width, width, height, taps);
+ }
+}
+
void ConvolveCompound2D_NEON(const void* const reference,
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
const int vertical_filter_index,
const int horizontal_filter_id,
const int vertical_filter_id, const int width,
- const int height, void* prediction,
+ const int height, void* const prediction,
const ptrdiff_t /*pred_stride*/) {
// The output of the horizontal filter, i.e. the intermediate_result, is
// guaranteed to fit in int16_t.
@@ -2492,55 +2532,26 @@ void ConvolveCompound2D_NEON(const void* const reference,
const auto* const src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride -
kHorizontalOffset;
-
DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
src, src_stride, intermediate_result, width, width, intermediate_height,
horizontal_filter_id, horiz_filter_index);
// Vertical filter.
- auto* dest = static_cast<uint16_t*>(prediction);
assert(vertical_filter_id != 0);
-
- const ptrdiff_t dest_stride = width;
const int16x8_t taps = vmovl_s8(
vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
-
if (vertical_taps == 8) {
- if (width == 4) {
- Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
- dest_stride, height, taps);
- } else {
- Filter2DVertical<8, /*is_compound=*/true>(
- intermediate_result, dest, dest_stride, width, height, taps);
- }
+ Compound2DVertical<8>(intermediate_result, width, height, taps, prediction);
} else if (vertical_taps == 6) {
- if (width == 4) {
- Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
- dest_stride, height, taps);
- } else {
- Filter2DVertical<6, /*is_compound=*/true>(
- intermediate_result, dest, dest_stride, width, height, taps);
- }
+ Compound2DVertical<6>(intermediate_result, width, height, taps, prediction);
} else if (vertical_taps == 4) {
- if (width == 4) {
- Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
- dest_stride, height, taps);
- } else {
- Filter2DVertical<4, /*is_compound=*/true>(
- intermediate_result, dest, dest_stride, width, height, taps);
- }
+ Compound2DVertical<4>(intermediate_result, width, height, taps, prediction);
} else { // |vertical_taps| == 2
- if (width == 4) {
- Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
- dest_stride, height, taps);
- } else {
- Filter2DVertical<2, /*is_compound=*/true>(
- intermediate_result, dest, dest_stride, width, height, taps);
- }
+ Compound2DVertical<2>(intermediate_result, width, height, taps, prediction);
}
}
-inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+inline void HalfAddHorizontal(const uint8_t* const src, uint8_t* const dst) {
const uint8x16_t left = vld1q_u8(src);
const uint8x16_t right = vld1q_u8(src + 1);
vst1q_u8(dst, vrhaddq_u8(left, right));
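HalfAddHorizontal above relies on vrhaddq_u8, a rounding halving add, to average each pixel with its right-hand neighbour for the intra block copy paths that follow. A minimal scalar sketch of the same operation (the helper name is illustrative, not part of libgav1):

#include <cstdint>

// Scalar equivalent of vrhaddq_u8(left, right) applied across a row:
// dst[x] = (src[x] + src[x + 1] + 1) >> 1.
inline void HalfAddHorizontalScalar(const uint8_t* src, uint8_t* dst,
                                    int width) {
  for (int x = 0; x < width; ++x) {
    // Rounding halving add, matching the NEON instruction.
    dst[x] = static_cast<uint8_t>((src[x] + src[x + 1] + 1) >> 1);
  }
}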
@@ -2554,7 +2565,7 @@ inline void IntraBlockCopyHorizontal(const uint8_t* src,
const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
- int y = 0;
+ int y = height;
do {
HalfAddHorizontal(src, dst);
if (width >= 32) {
@@ -2586,7 +2597,7 @@ inline void IntraBlockCopyHorizontal(const uint8_t* src,
}
src += src_remainder_stride;
dst += dst_remainder_stride;
- } while (++y < height);
+ } while (--y != 0);
}
void ConvolveIntraBlockCopyHorizontal_NEON(
@@ -2610,7 +2621,7 @@ void ConvolveIntraBlockCopyHorizontal_NEON(
IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
pred_stride);
} else if (width == 8) {
- int y = 0;
+ int y = height;
do {
const uint8x8_t left = vld1_u8(src);
const uint8x8_t right = vld1_u8(src + 1);
@@ -2618,11 +2629,11 @@ void ConvolveIntraBlockCopyHorizontal_NEON(
src += reference_stride;
dest += pred_stride;
- } while (++y < height);
+ } while (--y != 0);
} else if (width == 4) {
uint8x8_t left = vdup_n_u8(0);
uint8x8_t right = vdup_n_u8(0);
- int y = 0;
+ int y = height;
do {
left = Load4<0>(src, left);
right = Load4<0>(src + 1, right);
@@ -2637,13 +2648,13 @@ void ConvolveIntraBlockCopyHorizontal_NEON(
dest += pred_stride;
StoreHi4(dest, result);
dest += pred_stride;
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else {
assert(width == 2);
uint8x8_t left = vdup_n_u8(0);
uint8x8_t right = vdup_n_u8(0);
- int y = 0;
+ int y = height;
do {
left = Load2<0>(src, left);
right = Load2<0>(src + 1, right);
@@ -2658,8 +2669,8 @@ void ConvolveIntraBlockCopyHorizontal_NEON(
dest += pred_stride;
Store2<1>(dest, result);
dest += pred_stride;
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
}
@@ -2694,7 +2705,7 @@ inline void IntraBlockCopyVertical(const uint8_t* src,
}
src += src_remainder_stride;
- int y = 0;
+ int y = height;
do {
below[0] = vld1q_u8(src);
if (width >= 32) {
@@ -2749,7 +2760,7 @@ inline void IntraBlockCopyVertical(const uint8_t* src,
}
}
dst += dst_remainder_stride;
- } while (++y < height);
+ } while (--y != 0);
}
void ConvolveIntraBlockCopyVertical_NEON(
@@ -2778,7 +2789,7 @@ void ConvolveIntraBlockCopyVertical_NEON(
row = vld1_u8(src);
src += reference_stride;
- int y = 0;
+ int y = height;
do {
below = vld1_u8(src);
src += reference_stride;
@@ -2787,13 +2798,13 @@ void ConvolveIntraBlockCopyVertical_NEON(
dest += pred_stride;
row = below;
- } while (++y < height);
+ } while (--y != 0);
} else if (width == 4) {
uint8x8_t row = Load4(src);
uint8x8_t below = vdup_n_u8(0);
src += reference_stride;
- int y = 0;
+ int y = height;
do {
below = Load4<0>(src, below);
src += reference_stride;
@@ -2802,14 +2813,14 @@ void ConvolveIntraBlockCopyVertical_NEON(
dest += pred_stride;
row = below;
- } while (++y < height);
+ } while (--y != 0);
} else {
assert(width == 2);
uint8x8_t row = Load2(src);
uint8x8_t below = vdup_n_u8(0);
src += reference_stride;
- int y = 0;
+ int y = height;
do {
below = Load2<0>(src, below);
src += reference_stride;
@@ -2818,7 +2829,7 @@ void ConvolveIntraBlockCopyVertical_NEON(
dest += pred_stride;
row = below;
- } while (++y < height);
+ } while (--y != 0);
}
}
@@ -2870,7 +2881,7 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
}
src += src_remainder_stride;
- int y = 0;
+ int y = height;
do {
const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2));
@@ -2981,7 +2992,7 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
}
src += src_remainder_stride;
dst += dst_remainder_stride;
- } while (++y < height);
+ } while (--y != 0);
}
void ConvolveIntraBlockCopy2D_NEON(
@@ -3013,7 +3024,7 @@ void ConvolveIntraBlockCopy2D_NEON(
uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
- int y = 0;
+ int y = height;
do {
left = Load4<0>(src, left);
right = Load4<0>(src + 1, right);
@@ -3032,8 +3043,8 @@ void ConvolveIntraBlockCopy2D_NEON(
dest += pred_stride;
row = vget_high_u16(below);
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else {
uint8x8_t left = Load2(src);
uint8x8_t right = Load2(src + 1);
@@ -3041,7 +3052,7 @@ void ConvolveIntraBlockCopy2D_NEON(
uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
- int y = 0;
+ int y = height;
do {
left = Load2<0>(src, left);
right = Load2<0>(src + 1, right);
@@ -3060,8 +3071,8 @@ void ConvolveIntraBlockCopy2D_NEON(
dest += pred_stride;
row = vget_high_u16(below);
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
}
@@ -3093,7 +3104,7 @@ void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
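A recurring change throughout convolve_neon.cc above is switching row loops from `int y = 0; ... while (++y < height)` to `int y = height; ... while (--y != 0)`, so the exit test compares the freshly decremented counter against zero. A small sketch of the pattern, assuming height is positive (the function name is illustrative):

#include <cstddef>
#include <cstdint>

// Row loop counting down to zero instead of up to |height|.
uint32_t SumRowsCountingDown(const uint8_t* src, ptrdiff_t stride,
                             int width, int height) {
  uint32_t sum = 0;
  int y = height;
  do {
    for (int x = 0; x < width; ++x) sum += src[x];
    src += stride;
  } while (--y != 0);  // Decrement and test against zero in one step.
  return sum;
}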
diff --git a/src/dsp/arm/distance_weighted_blend_neon.cc b/src/dsp/arm/distance_weighted_blend_neon.cc
index 04952ab..a0cd0ac 100644
--- a/src/dsp/arm/distance_weighted_blend_neon.cc
+++ b/src/dsp/arm/distance_weighted_blend_neon.cc
@@ -30,10 +30,12 @@
namespace libgav1 {
namespace dsp {
-namespace {
constexpr int kInterPostRoundBit = 4;
+namespace low_bitdepth {
+namespace {
+
inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0,
const int16x8_t pred1,
const int16x4_t weights[2]) {
@@ -185,13 +187,167 @@ void Init8bpp() {
}
} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+inline uint16x4x2_t ComputeWeightedAverage8(const uint16x4x2_t pred0,
+ const uint16x4x2_t pred1,
+ const uint16x4_t weights[2]) {
+ const uint32x4_t wpred0_lo = vmull_u16(weights[0], pred0.val[0]);
+ const uint32x4_t wpred0_hi = vmull_u16(weights[0], pred0.val[1]);
+ const uint32x4_t blended_lo = vmlal_u16(wpred0_lo, weights[1], pred1.val[0]);
+ const uint32x4_t blended_hi = vmlal_u16(wpred0_hi, weights[1], pred1.val[1]);
+ const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16);
+ const int32x4_t res_lo = vsubq_s32(vreinterpretq_s32_u32(blended_lo), offset);
+ const int32x4_t res_hi = vsubq_s32(vreinterpretq_s32_u32(blended_hi), offset);
+ const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1);
+ // Clip the result at (1 << bd) - 1.
+ uint16x4x2_t result;
+ result.val[0] =
+ vmin_u16(vqrshrun_n_s32(res_lo, kInterPostRoundBit + 4), bd_max);
+ result.val[1] =
+ vmin_u16(vqrshrun_n_s32(res_hi, kInterPostRoundBit + 4), bd_max);
+ return result;
+}
+
+inline uint16x4x4_t ComputeWeightedAverage8(const uint16x4x4_t pred0,
+ const uint16x4x4_t pred1,
+ const uint16x4_t weights[2]) {
+ const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16);
+ const uint32x4_t wpred0 = vmull_u16(weights[0], pred0.val[0]);
+ const uint32x4_t wpred1 = vmull_u16(weights[0], pred0.val[1]);
+ const uint32x4_t blended0 = vmlal_u16(wpred0, weights[1], pred1.val[0]);
+ const uint32x4_t blended1 = vmlal_u16(wpred1, weights[1], pred1.val[1]);
+ const int32x4_t res0 = vsubq_s32(vreinterpretq_s32_u32(blended0), offset);
+ const int32x4_t res1 = vsubq_s32(vreinterpretq_s32_u32(blended1), offset);
+ const uint32x4_t wpred2 = vmull_u16(weights[0], pred0.val[2]);
+ const uint32x4_t wpred3 = vmull_u16(weights[0], pred0.val[3]);
+ const uint32x4_t blended2 = vmlal_u16(wpred2, weights[1], pred1.val[2]);
+ const uint32x4_t blended3 = vmlal_u16(wpred3, weights[1], pred1.val[3]);
+ const int32x4_t res2 = vsubq_s32(vreinterpretq_s32_u32(blended2), offset);
+ const int32x4_t res3 = vsubq_s32(vreinterpretq_s32_u32(blended3), offset);
+ const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1);
+ // Clip the result at (1 << bd) - 1.
+ uint16x4x4_t result;
+ result.val[0] =
+ vmin_u16(vqrshrun_n_s32(res0, kInterPostRoundBit + 4), bd_max);
+ result.val[1] =
+ vmin_u16(vqrshrun_n_s32(res1, kInterPostRoundBit + 4), bd_max);
+ result.val[2] =
+ vmin_u16(vqrshrun_n_s32(res2, kInterPostRoundBit + 4), bd_max);
+ result.val[3] =
+ vmin_u16(vqrshrun_n_s32(res3, kInterPostRoundBit + 4), bd_max);
+
+ return result;
+}
+
+// We could use vld1_u16_x2, but for compatibility reasons, use this function
+// instead. The compiler optimizes to the correct instruction.
+inline uint16x4x2_t LoadU16x4_x2(uint16_t const* ptr) {
+ uint16x4x2_t x;
+ // gcc/clang (64 bit) optimizes the following to ldp.
+ x.val[0] = vld1_u16(ptr);
+ x.val[1] = vld1_u16(ptr + 4);
+ return x;
+}
+
+// We could use vld1_u16_x4, but for compatibility reasons, use this function
+// instead. The compiler optimizes to a pair of vld1_u16_x2, which showed better
+// performance in the speed tests.
+inline uint16x4x4_t LoadU16x4_x4(uint16_t const* ptr) {
+ uint16x4x4_t x;
+ x.val[0] = vld1_u16(ptr);
+ x.val[1] = vld1_u16(ptr + 4);
+ x.val[2] = vld1_u16(ptr + 8);
+ x.val[3] = vld1_u16(ptr + 12);
+ return x;
+}
+
+void DistanceWeightedBlend_NEON(const void* prediction_0,
+ const void* prediction_1,
+ const uint8_t weight_0, const uint8_t weight_1,
+ const int width, const int height,
+ void* const dest, const ptrdiff_t dest_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+ const uint16x4_t weights[2] = {vdup_n_u16(weight_0), vdup_n_u16(weight_1)};
-void DistanceWeightedBlendInit_NEON() { Init8bpp(); }
+ if (width == 4) {
+ int y = height;
+ do {
+ const uint16x4x2_t src0 = LoadU16x4_x2(pred_0);
+ const uint16x4x2_t src1 = LoadU16x4_x2(pred_1);
+ const uint16x4x2_t res = ComputeWeightedAverage8(src0, src1, weights);
+ vst1_u16(dst, res.val[0]);
+ vst1_u16(dst + dst_stride, res.val[1]);
+ dst += dst_stride << 1;
+ pred_0 += 8;
+ pred_1 += 8;
+ y -= 2;
+ } while (y != 0);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ const uint16x4x4_t src0 = LoadU16x4_x4(pred_0);
+ const uint16x4x4_t src1 = LoadU16x4_x4(pred_1);
+ const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights);
+ vst1_u16(dst, res.val[0]);
+ vst1_u16(dst + 4, res.val[1]);
+ vst1_u16(dst + dst_stride, res.val[2]);
+ vst1_u16(dst + dst_stride + 4, res.val[3]);
+ dst += dst_stride << 1;
+ pred_0 += 16;
+ pred_1 += 16;
+ y -= 2;
+ } while (y != 0);
+ } else {
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const uint16x4x4_t src0 = LoadU16x4_x4(pred_0 + x);
+ const uint16x4x4_t src1 = LoadU16x4_x4(pred_1 + x);
+ const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights);
+ vst1_u16(dst + x, res.val[0]);
+ vst1_u16(dst + x + 4, res.val[1]);
+ vst1_u16(dst + x + 8, res.val[2]);
+ vst1_u16(dst + x + 12, res.val[3]);
+ x += 16;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+ }
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->distance_weighted_blend = DistanceWeightedBlend_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void DistanceWeightedBlendInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
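The new 10bpp path keeps the predictions in unsigned 16-bit compound form, so each output is a weighted sum minus the compound offset, rounded and clamped to the 10-bit range. A scalar sketch of the per-pixel arithmetic in ComputeWeightedAverage8 (the function name and the |compound_offset| parameter are illustrative; the NEON code above subtracts kCompoundOffset * 16):

#include <algorithm>
#include <cstdint>

// weight0 + weight1 == 16, so the shift carries an extra 4 bits.
inline uint16_t BlendPixel10bpp(uint16_t pred0, uint16_t pred1,
                                uint8_t weight0, uint8_t weight1,
                                int compound_offset) {
  constexpr int kInterPostRoundBit = 4;
  constexpr int kShift = kInterPostRoundBit + 4;
  const int blended = weight0 * pred0 + weight1 * pred1 - compound_offset;
  // Round-shift, then clamp like vqrshrun_n_s32 followed by vmin_u16.
  const int rounded = (blended + (1 << (kShift - 1))) >> kShift;
  return static_cast<uint16_t>(std::clamp(rounded, 0, (1 << 10) - 1));
}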
diff --git a/src/dsp/arm/distance_weighted_blend_neon.h b/src/dsp/arm/distance_weighted_blend_neon.h
index 4d8824c..94a799c 100644
--- a/src/dsp/arm/distance_weighted_blend_neon.h
+++ b/src/dsp/arm/distance_weighted_blend_neon.h
@@ -34,6 +34,8 @@ void DistanceWeightedBlendInit_NEON();
#if LIBGAV1_ENABLE_NEON
#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON
+
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc
index 2612466..8ee3745 100644
--- a/src/dsp/arm/film_grain_neon.cc
+++ b/src/dsp/arm/film_grain_neon.cc
@@ -1176,7 +1176,7 @@ void FilmGrainInit_NEON() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/intra_edge_neon.cc b/src/dsp/arm/intra_edge_neon.cc
index 00b186a..074283f 100644
--- a/src/dsp/arm/intra_edge_neon.cc
+++ b/src/dsp/arm/intra_edge_neon.cc
@@ -25,7 +25,7 @@
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
-#include "src/utils/common.h" // RightShiftWithRounding()
+#include "src/utils/common.h"
namespace libgav1 {
namespace dsp {
@@ -35,6 +35,11 @@ namespace {
// required.
constexpr int kKernelsNEON[3][2] = {{4, 8}, {5, 6}};
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
assert(strength == 1 || strength == 2 || strength == 3);
const int kernel_index = strength - 1;
@@ -44,6 +49,8 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
// elements written is |size| - 1.
if (size == 1) return;
+ const uint8x16_t v_index = vcombine_u8(vcreate_u8(0x0706050403020100),
+ vcreate_u8(0x0f0e0d0c0b0a0908));
// |strength| 1 and 2 use a 3 tap filter.
if (strength < 3) {
// The last value requires extending the buffer (duplicating
@@ -89,7 +96,6 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
// |remainder| == 1 then we don't have to do anything.
const int remainder = (size - 1) & 0xf;
if (remainder > 1) {
- uint8_t temp[16];
const uint8x16_t src_1 = vld1q_u8(dst_buffer + i);
const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1);
@@ -102,9 +108,11 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
const uint8x16_t result =
vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
-
- vst1q_u8(temp, result);
- memcpy(dst_buffer + i, temp, remainder);
+ const uint8x16_t v_remainder = vdupq_n_u8(remainder);
+      // Create overwrite mask.
+ const uint8x16_t mask = vcleq_u8(v_remainder, v_index);
+ const uint8x16_t dst_remainder = vbslq_u8(mask, src_1, result);
+ vst1q_u8(dst_buffer + i, dst_remainder);
}
dst_buffer[size - 1] = last_val;
@@ -173,7 +181,6 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
// Like the 3 tap but if there are two remaining values we have already
// calculated them.
if (remainder > 2) {
- uint8_t temp[16];
const uint8x16_t src_2 = vld1q_u8(dst_buffer + i);
const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1);
const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2);
@@ -193,9 +200,11 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
const uint8x16_t result =
vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
-
- vst1q_u8(temp, result);
- memcpy(dst_buffer + i, temp, remainder);
+ const uint8x16_t v_remainder = vdupq_n_u8(remainder);
+      // Create overwrite mask.
+ const uint8x16_t mask = vcleq_u8(v_remainder, v_index);
+ const uint8x16_t dst_remainder = vbslq_u8(mask, src_2, result);
+ vst1q_u8(dst_buffer + i, dst_remainder);
}
dst_buffer[1] = special_vals[0];
@@ -284,13 +293,225 @@ void Init8bpp() {
}
} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+const uint16_t kRemainderMask[8][8] = {
+ {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000},
+};
+
+void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
+ assert(strength == 1 || strength == 2 || strength == 3);
+ const int kernel_index = strength - 1;
+ auto* const dst_buffer = static_cast<uint16_t*>(buffer);
+
+ // The first element is not written out (but it is input) so the number of
+ // elements written is |size| - 1.
+ if (size == 1) return;
+
+ // |strength| 1 and 2 use a 3 tap filter.
+ if (strength < 3) {
+    // The last value requires extending the buffer (duplicating
+    // |dst_buffer[size - 1]|). Calculate it here to avoid extra processing in
+    // NEON.
+ const uint16_t last_val = RightShiftWithRounding(
+ kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] +
+ kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] +
+ kKernelsNEON[kernel_index][0] * dst_buffer[size - 1],
+ 4);
+
+ const uint16_t krn0 = kKernelsNEON[kernel_index][0];
+ const uint16_t krn1 = kKernelsNEON[kernel_index][1];
+
+ // The first value we need gets overwritten by the output from the
+ // previous iteration.
+ uint16x8_t src_0 = vld1q_u16(dst_buffer);
+ int i = 1;
+
+  // Process blocks until there are fewer than 8 values remaining.
+ for (; i < size - 7; i += 8) {
+ // Loading these at the end of the block with |src_0| will read past the
+ // end of |top_row_data[160]|, the source of |buffer|.
+ const uint16x8_t src_1 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0);
+ const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1);
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+ // Load the next row before overwriting. This loads an extra 7 values
+ // past |size| on the trailing iteration.
+ src_0 = vld1q_u16(dst_buffer + i + 7);
+ vst1q_u16(dst_buffer + i, result);
+ }
+
+ // The last output value |last_val| was already calculated so if
+ // |remainder| == 1 then we don't have to do anything.
+ const int remainder = (size - 1) & 0x7;
+ if (remainder > 1) {
+ const uint16x8_t src_1 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0);
+ const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1);
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+ const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]);
+ const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_1);
+ vst1q_u16(dst_buffer + i, dst_remainder);
+ }
+
+ dst_buffer[size - 1] = last_val;
+ return;
+ }
+
+ assert(strength == 3);
+ // 5 tap filter. The first element requires duplicating |buffer[0]| and the
+ // last two elements require duplicating |buffer[size - 1]|.
+ uint16_t special_vals[3];
+ special_vals[0] = RightShiftWithRounding(
+ (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) +
+ (dst_buffer[2] << 2) + (dst_buffer[3] << 1),
+ 4);
+ // Clamp index for very small |size| values.
+ const int first_index_min = std::max(size - 4, 0);
+ const int second_index_min = std::max(size - 3, 0);
+ const int third_index_min = std::max(size - 2, 0);
+ special_vals[1] = RightShiftWithRounding(
+ (dst_buffer[first_index_min] << 1) + (dst_buffer[second_index_min] << 2) +
+ (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) +
+ (dst_buffer[size - 1] << 1),
+ 4);
+ special_vals[2] = RightShiftWithRounding(
+ (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) +
+ // x << 2 + x << 2 == x << 3
+ (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1),
+ 4);
+
+ // The first two values we need get overwritten by the output from the
+ // previous iteration.
+ uint16x8_t src_0 = vld1q_u16(dst_buffer - 1);
+ uint16x8_t src_1 = vld1q_u16(dst_buffer);
+ int i = 1;
+
+ for (; i < size - 7; i += 8) {
+ // Loading these at the end of the block with |src_[01]| will read past
+ // the end of |top_row_data[160]|, the source of |buffer|.
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2);
+ const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1);
+ const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3);
+ const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2));
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+
+    // Load the next rows before overwriting.
+ src_0 = vld1q_u16(dst_buffer + i + 6);
+ src_1 = vld1q_u16(dst_buffer + i + 7);
+
+ vst1q_u16(dst_buffer + i, result);
+ }
+
+ const int remainder = (size - 1) & 0x7;
+ // Like the 3 tap but if there are two remaining values we have already
+ // calculated them.
+ if (remainder > 2) {
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2);
+ const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1);
+ const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3);
+ const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2));
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+ const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]);
+ const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_2);
+ vst1q_u16(dst_buffer + i, dst_remainder);
+ }
+
+ dst_buffer[1] = special_vals[0];
+ // Avoid overwriting |dst_buffer[0]|.
+ if (size > 2) dst_buffer[size - 2] = special_vals[1];
+ dst_buffer[size - 1] = special_vals[2];
+}
+
+void IntraEdgeUpsampler_NEON(void* buffer, const int size) {
+ assert(size % 4 == 0 && size <= 16);
+ auto* const pixel_buffer = static_cast<uint16_t*>(buffer);
-void IntraEdgeInit_NEON() { Init8bpp(); }
+ // Extend first/last samples
+ pixel_buffer[-2] = pixel_buffer[-1];
+ pixel_buffer[size] = pixel_buffer[size - 1];
+
+ const int16x8_t src_lo = vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2));
+ const int16x8_t src_hi =
+ vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2 + 8));
+ const int16x8_t src9_hi = vaddq_s16(src_hi, vshlq_n_s16(src_hi, 3));
+ const int16x8_t src9_lo = vaddq_s16(src_lo, vshlq_n_s16(src_lo, 3));
+
+ int16x8_t sum_lo = vsubq_s16(vextq_s16(src9_lo, src9_hi, 1), src_lo);
+ sum_lo = vaddq_s16(sum_lo, vextq_s16(src9_lo, src9_hi, 2));
+ sum_lo = vsubq_s16(sum_lo, vextq_s16(src_lo, src_hi, 3));
+ sum_lo = vrshrq_n_s16(sum_lo, 4);
+
+ uint16x8x2_t result_lo;
+ result_lo.val[0] =
+ vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_lo, vdupq_n_s16(0))),
+ vdupq_n_u16((1 << kBitdepth10) - 1));
+ result_lo.val[1] = vreinterpretq_u16_s16(vextq_s16(src_lo, src_hi, 2));
+
+ if (size > 8) {
+ const int16x8_t src_hi_extra =
+ vreinterpretq_s16_u16(vld1q_u16(pixel_buffer + 16 - 2));
+ const int16x8_t src9_hi_extra =
+ vaddq_s16(src_hi_extra, vshlq_n_s16(src_hi_extra, 3));
+
+ int16x8_t sum_hi = vsubq_s16(vextq_s16(src9_hi, src9_hi_extra, 1), src_hi);
+ sum_hi = vaddq_s16(sum_hi, vextq_s16(src9_hi, src9_hi_extra, 2));
+ sum_hi = vsubq_s16(sum_hi, vextq_s16(src_hi, src_hi_extra, 3));
+ sum_hi = vrshrq_n_s16(sum_hi, 4);
+
+ uint16x8x2_t result_hi;
+ result_hi.val[0] =
+ vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_hi, vdupq_n_s16(0))),
+ vdupq_n_u16((1 << kBitdepth10) - 1));
+ result_hi.val[1] =
+ vreinterpretq_u16_s16(vextq_s16(src_hi, src_hi_extra, 2));
+ vst2q_u16(pixel_buffer - 1, result_lo);
+ vst2q_u16(pixel_buffer + 15, result_hi);
+ } else {
+ vst2q_u16(pixel_buffer - 1, result_lo);
+ }
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->intra_edge_filter = IntraEdgeFilter_NEON;
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraEdgeInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
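Both the 8bpp and the new 10bpp IntraEdgeFilter_NEON above now handle the tail of the edge with a lane-select mask instead of writing the filtered vector to a temporary and memcpy'ing |remainder| bytes. A scalar sketch of the idea (illustrative names; the NEON versions build the mask with vcleq against an index vector, or load it from kRemainderMask, and blend with vbsl before a full-width store):

#include <cstdint>

template <typename Pixel, int kLanes>
inline void StoreTailWithMask(Pixel* dst, const Pixel (&result)[kLanes],
                              const Pixel (&original)[kLanes], int remainder) {
  for (int lane = 0; lane < kLanes; ++lane) {
    // Lanes below |remainder| take the filtered result; the rest keep the
    // original contents so the full-width store does not clobber them.
    dst[lane] = (lane < remainder) ? result[lane] : original[lane];
  }
}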
diff --git a/src/dsp/arm/intra_edge_neon.h b/src/dsp/arm/intra_edge_neon.h
index d3bb243..28e3494 100644
--- a/src/dsp/arm/intra_edge_neon.h
+++ b/src/dsp/arm/intra_edge_neon.h
@@ -34,6 +34,9 @@ void IntraEdgeInit_NEON();
#define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_IntraEdgeFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON
+
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
diff --git a/src/dsp/arm/intrapred_cfl_neon.cc b/src/dsp/arm/intrapred_cfl_neon.cc
index 45fe33b..8d8748f 100644
--- a/src/dsp/arm/intrapred_cfl_neon.cc
+++ b/src/dsp/arm/intrapred_cfl_neon.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_cfl.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
@@ -27,45 +27,20 @@
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
-namespace low_bitdepth {
-namespace {
-
-uint8x16_t Set2ValuesQ(const uint8_t* a) {
- uint16_t combined_values = a[0] | a[1] << 8;
- return vreinterpretq_u8_u16(vdupq_n_u16(combined_values));
-}
-
-uint32_t SumVector(uint32x2_t a) {
-#if defined(__aarch64__)
- return vaddv_u32(a);
-#else
- const uint64x1_t b = vpaddl_u32(a);
- return vget_lane_u32(vreinterpret_u32_u64(b), 0);
-#endif // defined(__aarch64__)
-}
-
-uint32_t SumVector(uint32x4_t a) {
-#if defined(__aarch64__)
- return vaddvq_u32(a);
-#else
- const uint64x2_t b = vpaddlq_u32(a);
- const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
- return vget_lane_u32(vreinterpret_u32_u64(c), 0);
-#endif // defined(__aarch64__)
-}
// Divide by the number of elements.
-uint32_t Average(const uint32_t sum, const int width, const int height) {
+inline uint32_t Average(const uint32_t sum, const int width, const int height) {
return RightShiftWithRounding(sum, FloorLog2(width) + FloorLog2(height));
}
// Subtract |val| from every element in |a|.
-void BlockSubtract(const uint32_t val,
- int16_t a[kCflLumaBufferStride][kCflLumaBufferStride],
- const int width, const int height) {
+inline void BlockSubtract(const uint32_t val,
+ int16_t a[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int width, const int height) {
assert(val <= INT16_MAX);
const int16x8_t val_v = vdupq_n_s16(static_cast<int16_t>(val));
@@ -94,6 +69,9 @@ void BlockSubtract(const uint32_t val,
}
}
+namespace low_bitdepth {
+namespace {
+
template <int block_width, int block_height>
void CflSubsampler420_NEON(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
@@ -122,26 +100,27 @@ void CflSubsampler420_NEON(
sum = SumVector(running_sum);
} else if (block_width == 8) {
- const uint8x16_t x_index = {0, 0, 2, 2, 4, 4, 6, 6,
- 8, 8, 10, 10, 12, 12, 14, 14};
- const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
- const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
+ const uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14};
+ const uint16x8_t x_max_index =
+ vdupq_n_u16(max_luma_width == 8 ? max_luma_width - 2 : 16);
+ const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index);
uint32x4_t running_sum = vdupq_n_u32(0);
for (int y = 0; y < block_height; ++y) {
- const uint8x16_t x_max0 = Set2ValuesQ(src + max_luma_width - 2);
- const uint8x16_t x_max1 = Set2ValuesQ(src + max_luma_width - 2 + stride);
+ const uint8x16_t row0 = vld1q_u8(src);
+ const uint8x16_t row1 = vld1q_u8(src + stride);
+ const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
+ const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1);
- uint8x16_t row0 = vld1q_u8(src);
- row0 = vbslq_u8(x_mask, row0, x_max0);
- uint8x16_t row1 = vld1q_u8(src + stride);
- row1 = vbslq_u8(x_mask, row1, x_max1);
+ // Dup the 2x2 sum at the max luma offset.
+ const uint16x8_t max_luma_sum =
+ vdupq_lane_u16(vget_low_u16(sum_row_shifted), 3);
+ const uint16x8_t final_sum_row =
+ vbslq_u16(x_mask, sum_row_shifted, max_luma_sum);
+ vst1q_s16(luma[y], vreinterpretq_s16_u16(final_sum_row));
- uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
- sum_row = vshlq_n_u16(sum_row, 1);
- running_sum = vpadalq_u16(running_sum, sum_row);
- vst1q_s16(luma[y], vreinterpretq_s16_u16(sum_row));
+ running_sum = vpadalq_u16(running_sum, final_sum_row);
if (y << 1 < max_luma_height - 2) {
src += stride << 1;
@@ -150,45 +129,35 @@ void CflSubsampler420_NEON(
sum = SumVector(running_sum);
} else /* block_width >= 16 */ {
- const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
+ const uint16x8_t x_max_index = vdupq_n_u16(max_luma_width - 2);
uint32x4_t running_sum = vdupq_n_u32(0);
for (int y = 0; y < block_height; ++y) {
- uint8x16_t x_index = {0, 2, 4, 6, 8, 10, 12, 14,
- 16, 18, 20, 22, 24, 26, 28, 30};
- const uint8x16_t x_max00 = vdupq_n_u8(src[max_luma_width - 2]);
- const uint8x16_t x_max01 = vdupq_n_u8(src[max_luma_width - 2 + 1]);
- const uint8x16_t x_max10 = vdupq_n_u8(src[stride + max_luma_width - 2]);
- const uint8x16_t x_max11 =
- vdupq_n_u8(src[stride + max_luma_width - 2 + 1]);
- for (int x = 0; x < block_width; x += 16) {
- const ptrdiff_t src_x_offset = x << 1;
- const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
- const uint8x16x2_t row0 = vld2q_u8(src + src_x_offset);
- const uint8x16x2_t row1 = vld2q_u8(src + src_x_offset + stride);
- const uint8x16_t row_masked_00 = vbslq_u8(x_mask, row0.val[0], x_max00);
- const uint8x16_t row_masked_01 = vbslq_u8(x_mask, row0.val[1], x_max01);
- const uint8x16_t row_masked_10 = vbslq_u8(x_mask, row1.val[0], x_max10);
- const uint8x16_t row_masked_11 = vbslq_u8(x_mask, row1.val[1], x_max11);
-
- uint16x8_t sum_row_lo =
- vaddl_u8(vget_low_u8(row_masked_00), vget_low_u8(row_masked_01));
- sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_10));
- sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_11));
- sum_row_lo = vshlq_n_u16(sum_row_lo, 1);
- running_sum = vpadalq_u16(running_sum, sum_row_lo);
- vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(sum_row_lo));
-
- uint16x8_t sum_row_hi =
- vaddl_u8(vget_high_u8(row_masked_00), vget_high_u8(row_masked_01));
- sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_10));
- sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_11));
- sum_row_hi = vshlq_n_u16(sum_row_hi, 1);
- running_sum = vpadalq_u16(running_sum, sum_row_hi);
- vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(sum_row_hi));
-
- x_index = vaddq_u8(x_index, vdupq_n_u8(32));
+      // Calculate the 2x2 sum at the max_luma offset.
+ const uint8_t a00 = src[max_luma_width - 2];
+ const uint8_t a01 = src[max_luma_width - 1];
+ const uint8_t a10 = src[max_luma_width - 2 + stride];
+ const uint8_t a11 = src[max_luma_width - 1 + stride];
+ // Dup the 2x2 sum at the max luma offset.
+ const uint16x8_t max_luma_sum =
+ vdupq_n_u16((uint16_t)((a00 + a01 + a10 + a11) << 1));
+ uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14};
+
+ ptrdiff_t src_x_offset = 0;
+ for (int x = 0; x < block_width; x += 8, src_x_offset += 16) {
+ const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index);
+ const uint8x16_t row0 = vld1q_u8(src + src_x_offset);
+ const uint8x16_t row1 = vld1q_u8(src + src_x_offset + stride);
+ const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
+ const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1);
+ const uint16x8_t final_sum_row =
+ vbslq_u16(x_mask, sum_row_shifted, max_luma_sum);
+ vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(final_sum_row));
+
+ running_sum = vpadalq_u16(running_sum, final_sum_row);
+ x_index = vaddq_u16(x_index, vdupq_n_u16(16));
}
+
if (y << 1 < max_luma_height - 2) {
src += stride << 1;
}
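The rewritten 4:2:0 subsamplers above compute each CfL luma value as a left-shifted 2x2 sum and splat the sum taken at the last visible column pair across the out-of-bounds lanes. A scalar sketch of one output row, with illustrative names (the NEON code reaches the same values with vpaddlq_u8/vpadalq_u8 plus a vbslq_u16 against the duplicated edge sum):

#include <cstddef>
#include <cstdint>

inline void CflSubsample420RowScalar(const uint8_t* src, ptrdiff_t stride,
                                     int block_width, int max_luma_width,
                                     int16_t* luma_row) {
  // 2x2 sum at the last visible column pair, reused past the visible width.
  const int last = max_luma_width - 2;
  const int16_t edge_sum = static_cast<int16_t>(
      (src[last] + src[last + 1] + src[stride + last] +
       src[stride + last + 1]) << 1);
  for (int x = 0; x < block_width; ++x) {
    const int sx = x << 1;
    if (sx < max_luma_width) {
      luma_row[x] = static_cast<int16_t>(
          (src[sx] + src[sx + 1] + src[stride + sx] + src[stride + sx + 1])
          << 1);
    } else {
      luma_row[x] = edge_sum;
    }
  }
}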
@@ -209,17 +178,30 @@ void CflSubsampler444_NEON(
uint32_t sum;
if (block_width == 4) {
assert(max_luma_width >= 4);
+ assert(max_luma_height <= block_height);
+ assert((max_luma_height % 2) == 0);
uint32x4_t running_sum = vdupq_n_u32(0);
uint8x8_t row = vdup_n_u8(0);
- for (int y = 0; y < block_height; y += 2) {
+ uint16x8_t row_shifted;
+ int y = 0;
+ do {
row = Load4<0>(src, row);
row = Load4<1>(src + stride, row);
if (y < (max_luma_height - 1)) {
src += stride << 1;
}
- const uint16x8_t row_shifted = vshll_n_u8(row, 3);
+ row_shifted = vshll_n_u8(row, 3);
+ running_sum = vpadalq_u16(running_sum, row_shifted);
+ vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
+ vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
+ y += 2;
+ } while (y < max_luma_height);
+
+ row_shifted =
+ vcombine_u16(vget_high_u16(row_shifted), vget_high_u16(row_shifted));
+ for (; y < block_height; y += 2) {
running_sum = vpadalq_u16(running_sum, row_shifted);
vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
@@ -463,12 +445,874 @@ void Init8bpp() {
} // namespace
} // namespace low_bitdepth
-void IntraPredCflInit_NEON() { low_bitdepth::Init8bpp(); }
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflSubsampler
+#ifndef __aarch64__
+uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
+ return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
+ vpadd_u16(vget_low_u16(b), vget_high_u16(b)));
+}
+#endif
+
+// This duplicates the last two 16-bit values in |row|.
+inline uint16x8_t LastRowSamples(const uint16x8_t row) {
+ const uint32x2_t a = vget_high_u32(vreinterpretq_u32_u16(row));
+ const uint32x4_t b = vdupq_lane_u32(a, 1);
+ return vreinterpretq_u16_u32(b);
+}
+
+// This duplicates the last unsigned 16-bit value in |row|.
+inline uint16x8_t LastRowResult(const uint16x8_t row) {
+ const uint16x4_t a = vget_high_u16(row);
+ const uint16x8_t b = vdupq_lane_u16(a, 0x3);
+ return b;
+}
+
+// This duplicates the last signed 16-bit value in |row|.
+inline int16x8_t LastRowResult(const int16x8_t row) {
+ const int16x4_t a = vget_high_s16(row);
+ const int16x8_t b = vdupq_lane_s16(a, 0x3);
+ return b;
+}
+
+// Takes in two sums of input row pairs, and completes the computation for two
+// output rows.
+inline uint16x8_t StoreLumaResults4_420(const uint16x8_t vertical_sum0,
+ const uint16x8_t vertical_sum1,
+ int16_t* luma_ptr) {
+ const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1);
+ const uint16x8_t result_shifted = vshlq_n_u16(result, 1);
+ vst1_s16(luma_ptr, vreinterpret_s16_u16(vget_low_u16(result_shifted)));
+ vst1_s16(luma_ptr + kCflLumaBufferStride,
+ vreinterpret_s16_u16(vget_high_u16(result_shifted)));
+ return result_shifted;
+}
+
+// Takes two halves of a vertically added pair of rows and completes the
+// computation for one output row.
+inline uint16x8_t StoreLumaResults8_420(const uint16x8_t vertical_sum0,
+ const uint16x8_t vertical_sum1,
+ int16_t* luma_ptr) {
+ const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1);
+ const uint16x8_t result_shifted = vshlq_n_u16(result, 1);
+ vst1q_s16(luma_ptr, vreinterpretq_s16_u16(result_shifted));
+ return result_shifted;
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_4xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 4, "");
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ uint16x4_t sum = vdup_n_u16(0);
+ uint16x4_t samples[2];
+ int y = visible_height;
+
+ do {
+ samples[0] = vld1_u16(src);
+ samples[1] = vld1_u16(src + src_stride);
+ src += src_stride << 1;
+ sum = vadd_u16(sum, samples[0]);
+ sum = vadd_u16(sum, samples[1]);
+ y -= 2;
+ } while (y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ samples[1] = vshl_n_u16(samples[1], 1);
+ do {
+ sum = vadd_u16(sum, samples[1]);
+ y += 2;
+ } while (y < block_height);
+ }
+
+ // Here the left shift by 3 (to increase precision) is nullified in right
+ // shift ((log2 of width 4) + 1).
+ const uint32_t average_sum =
+ RightShiftWithRounding(SumVector(vpaddl_u16(sum)), block_height_log2 - 1);
+ const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum));
+
+ const auto* ssrc = static_cast<const int16_t*>(source);
+ int16x4_t ssample;
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ ssample = vld1_s16(ssrc);
+ ssample = vshl_n_s16(ssample, 3);
+ vst1_s16(luma_ptr, vsub_s16(ssample, averages));
+ ssrc += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ vst1_s16(luma_ptr, vsub_s16(ssample, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_4xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_cast<void>(max_luma_width);
+ static_cast<void>(max_luma_height);
+ static_assert(block_height_log2 <= 4, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+
+ if (block_height <= max_luma_height) {
+ CflSubsampler444_4xH_NEON<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_4xH_NEON<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ uint32x4_t sum = vdupq_n_u32(0);
+ uint16x8_t samples;
+ int y = visible_height;
+
+ do {
+ samples = vld1q_u16(src);
+ src += src_stride;
+ sum = vpadalq_u16(sum, samples);
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ do {
+ sum = vpadalq_u16(sum, samples);
+ } while (++y < block_height);
+ }
+
+ // Here the left shift by 3 (to increase precision) is nullified in right
+ // shift (log2 of width 8).
+ const uint32_t average_sum =
+ RightShiftWithRounding(SumVector(sum), block_height_log2);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+ const auto* ssrc = static_cast<const int16_t*>(source);
+ int16x8_t ssample;
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ ssample = vld1q_s16(ssrc);
+ ssample = vshlq_n_s16(ssample, 3);
+ vst1q_s16(luma_ptr, vsubq_s16(ssample, averages));
+ ssrc += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ vst1q_s16(luma_ptr, vsubq_s16(ssample, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_cast<void>(max_luma_width);
+ static_cast<void>(max_luma_height);
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 8;
+
+ const int horz_inside = block_width <= max_luma_width;
+ const int vert_inside = block_height <= max_luma_height;
+ if (horz_inside && vert_inside) {
+ CflSubsampler444_8xH_NEON<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_8xH_NEON<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, bool is_inside>
+void CflSubsampler444_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const int block_width = 1 << block_width_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ uint32x4_t sum = vdupq_n_u32(0);
+ uint16x8_t samples[4];
+ int y = visible_height;
+
+ do {
+ samples[0] = vld1q_u16(src);
+ samples[1] =
+ (max_luma_width >= 16) ? vld1q_u16(src + 8) : LastRowResult(samples[0]);
+ uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]);
+ if (block_width == 32) {
+ samples[2] = (max_luma_width >= 24) ? vld1q_u16(src + 16)
+ : LastRowResult(samples[1]);
+ samples[3] = (max_luma_width == 32) ? vld1q_u16(src + 24)
+ : LastRowResult(samples[2]);
+ inner_sum = vaddq_u16(samples[2], inner_sum);
+ inner_sum = vaddq_u16(samples[3], inner_sum);
+ }
+ sum = vpadalq_u16(sum, inner_sum);
+ src += src_stride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]);
+ if (block_width == 32) {
+ inner_sum = vaddq_u16(samples[2], inner_sum);
+ inner_sum = vaddq_u16(samples[3], inner_sum);
+ }
+ do {
+ sum = vpadalq_u16(sum, inner_sum);
+ } while (++y < block_height);
+ }
+
+ // Here the left shift by 3 (to increase precision) is subtracted in right
+ // shift factor (block_width_log2 + block_height_log2 - 3).
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(sum), block_width_log2 + block_height_log2 - 3);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+ const auto* ssrc = static_cast<const int16_t*>(source);
+ int16x8_t ssamples_ext = vdupq_n_s16(0);
+ int16x8_t ssamples[4];
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ int idx = 0;
+ for (int x = 0; x < block_width; x += 8) {
+ if (max_luma_width > x) {
+ ssamples[idx] = vld1q_s16(&ssrc[x]);
+ ssamples[idx] = vshlq_n_s16(ssamples[idx], 3);
+ ssamples_ext = ssamples[idx];
+ } else {
+ ssamples[idx] = LastRowResult(ssamples_ext);
+ }
+ vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages));
+ }
+ ssrc += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ int idx = 0;
+ for (int x = 0; x < block_width; x += 8) {
+ vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages));
+ }
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler444_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_width_log2 == 4 || block_width_log2 == 5,
+ "This function will only work for block_width 16 and 32.");
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+
+ const int block_height = 1 << block_height_log2;
+ const int vert_inside = block_height <= max_luma_height;
+ if (vert_inside) {
+ CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_4xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = luma_height;
+
+ uint32x4_t final_sum = vdupq_n_u32(0);
+ do {
+ const uint16x8_t samples_row0 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row1 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum01 = vaddq_u16(samples_row0, samples_row1);
+
+ const uint16x8_t samples_row2 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row3 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum23 = vaddq_u16(samples_row2, samples_row3);
+ uint16x8_t sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ const uint16x8_t samples_row4 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row5 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum45 = vaddq_u16(samples_row4, samples_row5);
+
+ const uint16x8_t samples_row6 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row7 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum67 = vaddq_u16(samples_row6, samples_row7);
+ sum =
+ vaddq_u16(sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ final_sum = vpadalq_u16(final_sum, sum);
+ y -= 4;
+ } while (y != 0);
+
+ const uint16x4_t final_fill =
+ vreinterpret_u16_s16(vld1_s16(luma_ptr - kCflLumaBufferStride));
+ const uint32x4_t final_fill_to_sum = vmovl_u16(final_fill);
+ for (y = luma_height; y < block_height; ++y) {
+ vst1_s16(luma_ptr, vreinterpret_s16_u16(final_fill));
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+ }
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(final_sum), block_height_log2 + 2 /*log2 of width 4*/);
+ const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum));
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const int16x4_t samples = vld1_s16(luma_ptr);
+ vst1_s16(luma_ptr, vsub_s16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = luma_height;
+
+ uint32x4_t final_sum = vdupq_n_u32(0);
+ do {
+ const uint16x8_t samples_row00 = vld1q_u16(src);
+ const uint16x8_t samples_row01 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row00);
+ src += src_stride;
+ const uint16x8_t samples_row10 = vld1q_u16(src);
+ const uint16x8_t samples_row11 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row10);
+ src += src_stride;
+ const uint16x8_t luma_sum00 = vaddq_u16(samples_row00, samples_row10);
+ const uint16x8_t luma_sum01 = vaddq_u16(samples_row01, samples_row11);
+ uint16x8_t sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
+ luma_ptr += kCflLumaBufferStride;
+
+ const uint16x8_t samples_row20 = vld1q_u16(src);
+ const uint16x8_t samples_row21 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row20);
+ src += src_stride;
+ const uint16x8_t samples_row30 = vld1q_u16(src);
+ const uint16x8_t samples_row31 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row30);
+ src += src_stride;
+ const uint16x8_t luma_sum10 = vaddq_u16(samples_row20, samples_row30);
+ const uint16x8_t luma_sum11 = vaddq_u16(samples_row21, samples_row31);
+ sum =
+ vaddq_u16(sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const uint16x8_t samples_row40 = vld1q_u16(src);
+ const uint16x8_t samples_row41 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row40);
+ src += src_stride;
+ const uint16x8_t samples_row50 = vld1q_u16(src);
+ const uint16x8_t samples_row51 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row50);
+ src += src_stride;
+ const uint16x8_t luma_sum20 = vaddq_u16(samples_row40, samples_row50);
+ const uint16x8_t luma_sum21 = vaddq_u16(samples_row41, samples_row51);
+ sum =
+ vaddq_u16(sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const uint16x8_t samples_row60 = vld1q_u16(src);
+ const uint16x8_t samples_row61 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row60);
+ src += src_stride;
+ const uint16x8_t samples_row70 = vld1q_u16(src);
+ const uint16x8_t samples_row71 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row70);
+ src += src_stride;
+ const uint16x8_t luma_sum30 = vaddq_u16(samples_row60, samples_row70);
+ const uint16x8_t luma_sum31 = vaddq_u16(samples_row61, samples_row71);
+ sum =
+ vaddq_u16(sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = vpadalq_u16(final_sum, sum);
+ y -= 4;
+ } while (y != 0);
+
+ // Duplicate the final row downward to the end after max_luma_height.
+ const uint16x8_t final_fill =
+ vreinterpretq_u16_s16(vld1q_s16(luma_ptr - kCflLumaBufferStride));
+ const uint32x4_t final_fill_to_sum =
+ vaddl_u16(vget_low_u16(final_fill), vget_high_u16(final_fill));
+
+ for (y = luma_height; y < block_height; ++y) {
+ vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill));
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+ }
+
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(final_sum), block_height_log2 + 3 /*log2 of width 8*/);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const int16x8_t samples = vld1q_s16(luma_ptr);
+ vst1q_s16(luma_ptr, vsubq_s16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_height_log2>
+void CflSubsampler420_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ if (max_luma_width == 8) {
+ CflSubsampler420Impl_8xH_NEON<block_height_log2, 8>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler420Impl_8xH_NEON<block_height_log2, 16>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ const int block_height = 1 << block_height_log2;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int16_t* luma_ptr = luma[0];
+ // Begin first y section, covering width up to 32.
+ int y = luma_height;
+
+ uint16x8_t final_fill0, final_fill1;
+ uint32x4_t final_sum = vdupq_n_u32(0);
+ do {
+ const uint16_t* src_next = src + src_stride;
+ const uint16x8_t samples_row00 = vld1q_u16(src);
+ const uint16x8_t samples_row01 = (max_luma_width >= 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row00);
+ const uint16x8_t samples_row02 = (max_luma_width >= 24)
+ ? vld1q_u16(src + 16)
+ : LastRowSamples(samples_row01);
+ const uint16x8_t samples_row03 = (max_luma_width == 32)
+ ? vld1q_u16(src + 24)
+ : LastRowSamples(samples_row02);
+ const uint16x8_t samples_row10 = vld1q_u16(src_next);
+ const uint16x8_t samples_row11 = (max_luma_width >= 16)
+ ? vld1q_u16(src_next + 8)
+ : LastRowSamples(samples_row10);
+ const uint16x8_t samples_row12 = (max_luma_width >= 24)
+ ? vld1q_u16(src_next + 16)
+ : LastRowSamples(samples_row11);
+ const uint16x8_t samples_row13 = (max_luma_width == 32)
+ ? vld1q_u16(src_next + 24)
+ : LastRowSamples(samples_row12);
+ const uint16x8_t luma_sum0 = vaddq_u16(samples_row00, samples_row10);
+ const uint16x8_t luma_sum1 = vaddq_u16(samples_row01, samples_row11);
+ const uint16x8_t luma_sum2 = vaddq_u16(samples_row02, samples_row12);
+ const uint16x8_t luma_sum3 = vaddq_u16(samples_row03, samples_row13);
+ final_fill0 = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
+ final_fill1 = StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
+ const uint16x8_t sum = vaddq_u16(final_fill0, final_fill1);
+
+ final_sum = vpadalq_u16(final_sum, sum);
+
+    // Because max_luma_width is at most 32 luma samples, a 32-wide chroma
+    // block has every column at x >= 16 duplicated from the last computed
+    // value; each duplicated lane is counted twice (the shift by 1) so the
+    // sum covers all 16 duplicated columns.
+ if (block_width_log2 == 5) {
+ const uint16x8_t wide_fill = LastRowResult(final_fill1);
+ final_sum = vpadalq_u16(final_sum, vshlq_n_u16(wide_fill, 1));
+ }
+ src += src_stride << 1;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ // Begin second y section.
+ y = luma_height;
+ if (y < block_height) {
+ uint32x4_t wide_fill;
+ if (block_width_log2 == 5) {
+      // There are 16 16-bit fill values per row beyond x=16. Only 4 lanes are
+      // widened here, so shifting by 2 counts each lane four times, matching
+      // the (a + a) << 1 pairwise accumulation used in the loop above.
+ wide_fill = vshll_n_u16(vget_low_u16(LastRowResult(final_fill1)), 2);
+ }
+ const uint16x8_t final_inner_sum = vaddq_u16(final_fill0, final_fill1);
+ const uint32x4_t final_fill_to_sum = vaddl_u16(
+ vget_low_u16(final_inner_sum), vget_high_u16(final_inner_sum));
+
+ do {
+ vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill0));
+ vst1q_s16(luma_ptr + 8, vreinterpretq_s16_u16(final_fill1));
+ if (block_width_log2 == 5) {
+ final_sum = vaddq_u32(final_sum, wide_fill);
+ }
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+ } while (++y < block_height);
+ } // End second y section.
+
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(final_sum), block_width_log2 + block_height_log2);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const int16x8_t samples0 = vld1q_s16(luma_ptr);
+ vst1q_s16(luma_ptr, vsubq_s16(samples0, averages));
+ const int16x8_t samples1 = vld1q_s16(luma_ptr + 8);
+ const int16x8_t final_row_result = vsubq_s16(samples1, averages);
+ vst1q_s16(luma_ptr + 8, final_row_result);
+
+ if (block_width_log2 == 5) {
+ const int16x8_t wide_fill = LastRowResult(final_row_result);
+ vst1q_s16(luma_ptr + 16, wide_fill);
+ vst1q_s16(luma_ptr + 24, wide_fill);
+ }
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+//------------------------------------------------------------------------------
+// Choose subsampler based on max_luma_width
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler420_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ switch (max_luma_width) {
+ case 8:
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 8>(
+ luma, max_luma_height, source, stride);
+ return;
+ case 16:
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 16>(
+ luma, max_luma_height, source, stride);
+ return;
+ case 24:
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 24>(
+ luma, max_luma_height, source, stride);
+ return;
+ default:
+ assert(max_luma_width == 32);
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 32>(
+ luma, max_luma_height, source, stride);
+ return;
+ }
+}
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor
+
+// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
+// |alpha| can be -16 to 16 (inclusive).
+// Clip |dc + ((alpha * luma) >> 6)| to the range [0, (1 << bitdepth) - 1].
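+// For example (illustrative values): with bitdepth 10, alpha = -4 and
+// luma = 96, (alpha * luma) >> 6 == -6, so the result is dc - 6 clamped to
+// [0, 1023].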
+inline uint16x8_t Combine8(const int16x8_t luma, const int16x8_t alpha_abs,
+ const int16x8_t alpha_signed, const int16x8_t dc,
+ const uint16x8_t max_value) {
+ const int16x8_t luma_abs = vabsq_s16(luma);
+ const int16x8_t luma_alpha_sign =
+ vshrq_n_s16(veorq_s16(luma, alpha_signed), 15);
+  // (alpha * luma) >> 6: |alpha_abs| was pre-scaled by 1 << 9, so the
+  // saturating rounding doubling multiply-high (effectively >> 15) yields the
+  // required >> 6.
+ const int16x8_t la_abs = vqrdmulhq_s16(luma_abs, alpha_abs);
+ // Convert back to signed values.
+ const int16x8_t la =
+ vsubq_s16(veorq_s16(la_abs, luma_alpha_sign), luma_alpha_sign);
+ const int16x8_t result = vaddq_s16(la, dc);
+ const int16x8_t zero = vdupq_n_s16(0);
+ // Clip.
+ return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(result, zero)), max_value);
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor4xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
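+  // |dst[0]| already holds the DC prediction for the block, written by the DC
+  // predictor before this function runs.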
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; y += 2) {
+ const int16x4_t luma_row0 = vld1_s16(luma[y]);
+ const int16x4_t luma_row1 = vld1_s16(luma[y + 1]);
+ const int16x8_t combined_luma = vcombine_s16(luma_row0, luma_row1);
+ const uint16x8_t sum =
+ Combine8(combined_luma, alpha_abs, alpha_signed, dc, max_value);
+ vst1_u16(dst, vget_low_u16(sum));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(sum));
+ dst += dst_stride;
+ }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor8xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row = vld1q_s16(luma[y]);
+ const uint16x8_t sum =
+ Combine8(luma_row, alpha_abs, alpha_signed, dc, max_value);
+ vst1q_u16(dst, sum);
+ dst += dst_stride;
+ }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor16xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+ const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+ const uint16x8_t sum_0 =
+ Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_1 =
+ Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value);
+ vst1q_u16(dst, sum_0);
+ vst1q_u16(dst + 8, sum_1);
+ dst += dst_stride;
+ }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor32xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+ const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+ const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16);
+ const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24);
+ const uint16x8_t sum_0 =
+ Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_1 =
+ Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_2 =
+ Combine8(luma_row_2, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_3 =
+ Combine8(luma_row_3, alpha_abs, alpha_signed, dc, max_value);
+ vst1q_u16(dst, sum_0);
+ vst1q_u16(dst + 8, sum_1);
+ vst1q_u16(dst + 16, sum_2);
+ vst1q_u16(dst + 24, sum_3);
+ dst += dst_stride;
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler420_4xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler420_4xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler420_4xH_NEON<4>;
+
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<4>;
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<5>;
+
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 2>;
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 3>;
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 4>;
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 5>;
+
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<5, 3>;
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<5, 4>;
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<5, 5>;
+
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler444_4xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler444_4xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler444_4xH_NEON<4>;
+
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<4>;
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<5>;
+
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 2>;
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 3>;
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 4>;
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 5>;
+
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<5, 3>;
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<5, 4>;
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<5, 5>;
+
+ dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>;
+
+ dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>;
+
+ dsp->cfl_intra_predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor16xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor16xN_NEON<32>;
+ dsp->cfl_intra_predictors[kTransformSize32x8] = CflIntraPredictor32xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor32xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor32xN_NEON<32>;
+ // Max Cfl predictor size is 32x32.
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredCflInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/intrapred_cfl_neon.h b/src/dsp/arm/intrapred_cfl_neon.h
new file mode 100644
index 0000000..b4f983a
--- /dev/null
+++ b/src/dsp/arm/intrapred_cfl_neon.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers, see the
+// defines below for specifics. These functions are not thread-safe.
+void IntraPredCflInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+// 4x4
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// -----------------------------------------------------------------------------
+// 10bpp
+
+// 4x4
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc
index 805ba81..3f5edbd 100644
--- a/src/dsp/arm/intrapred_directional_neon.cc
+++ b/src/dsp/arm/intrapred_directional_neon.cc
@@ -12,18 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_directional.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
#include <arm_neon.h>
-#include <algorithm> // std::min
+#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
-#include <cstring> // memset
+#include <cstring>
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
@@ -35,14 +35,14 @@ namespace dsp {
namespace low_bitdepth {
namespace {
-// Blend two values based on a 32 bit weight.
+// Blend two values based on weights that sum to 32.
inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
const uint8x8_t a_weight,
const uint8x8_t b_weight) {
const uint16x8_t a_product = vmull_u8(a, a_weight);
const uint16x8_t b_product = vmull_u8(b, b_weight);
- return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5);
+ return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5 /*log2(32)*/);
}
// For vertical operations the weights are one constant value.
@@ -112,7 +112,7 @@ inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
// 4 wide subsamples the output. 8 wide subsamples the input.
if (width == 4) {
const uint8x8_t left_values = vld1_u8(top + top_base_x);
- const uint8x8_t right_values = RightShift<8>(left_values);
+ const uint8x8_t right_values = RightShiftVector<8>(left_values);
const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
// If |upsampled| is true then extract every other value for output.
@@ -910,12 +910,590 @@ void Init8bpp() {
} // namespace
} // namespace low_bitdepth
-void IntraPredDirectionalInit_NEON() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Blend two values based on weights that sum to 32.
+inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
+ const int a_weight, const int b_weight) {
+ const uint16x4_t a_product = vmul_n_u16(a, a_weight);
+ const uint16x4_t sum = vmla_n_u16(a_product, b, b_weight);
+
+ return vrshr_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Blend two values based on weights that sum to 32.
+inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
+ const uint16_t a_weight,
+ const uint16_t b_weight) {
+ const uint16x8_t a_product = vmulq_n_u16(a, a_weight);
+ const uint16x8_t sum = vmlaq_n_u16(a_product, b, b_weight);
+
+ return vrshrq_n_u16(sum, 5 /*log2(32)*/);
+}
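+
+// In both overloads the weights are the 5-bit fractional pixel position and
+// its complement, e.g. b_weight == 12 yields (20 * a + 12 * b + 16) >> 5.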
+
+// Each element of |dest| contains values associated with one weight value.
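+// When |upsampled| is true the blend pairs are interleaved in the edge
+// buffer, so vld2 deinterleaves them; otherwise val[1] is simply val[0]
+// shifted by one sample.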
+inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* const source,
+ const bool upsampled) {
+ if (upsampled) {
+ *dest = vld2_u16(source);
+ } else {
+ dest->val[0] = vld1_u16(source);
+ dest->val[1] = vld1_u16(source + 1);
+ }
+}
+
+// Each element of |dest| contains values associated with one weight value.
+inline void LoadEdgeVals(uint16x8x2_t* dest, const uint16_t* const source,
+ const bool upsampled) {
+ if (upsampled) {
+ *dest = vld2q_u16(source);
+ } else {
+ dest->val[0] = vld1q_u16(source);
+ dest->val[1] = vld1q_u16(source + 1);
+ }
+}
+
+template <bool upsampled>
+inline void DirectionalZone1_4xH(uint16_t* dst, const ptrdiff_t stride,
+ const int height, const uint16_t* const top,
+ const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ const int max_base_x = (4 + height - 1) << upsample_shift;
+ const int16x4_t max_base = vdup_n_s16(max_base_x);
+ const uint16x4_t final_top_val = vdup_n_u16(top[max_base_x]);
+ const int16x4_t index_offset = {0, 1, 2, 3};
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
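+  // e.g. with height == 16, no upsampling, and a hypothetical xstep of 1024:
+  // max_base_x == 19 and xstep_units == 16, so min_corner_only_y == 1 and
+  // every row after the first is filled with top[19].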
+
+ int top_x = xstep;
+ int y = 0;
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ const int top_base_x = top_x >> index_scale_bits;
+
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ // Use signed values to compare |top_base_x| to |max_base_x|.
+ const int16x4_t base_x = vadd_s16(vdup_n_s16(top_base_x), index_offset);
+ const uint16x4_t max_base_mask = vclt_s16(base_x, max_base);
+
+ uint16x4x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x4_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+    // Replace any lane whose index reaches |max_base_x| with the final top
+    // value.
+ const uint16x4_t masked_result =
+ vbsl_u16(max_base_mask, combined, final_top_val);
+
+ vst1_u16(dst, masked_result);
+ }
+ for (; y < height; ++y) {
+ Memset(dst, top[max_base_x], 4 /* width */);
+ dst += stride;
+ }
+}
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+template <bool upsampled>
+inline void DirectionalZone1_WxH(uint16_t* dst, const ptrdiff_t stride,
+ const int width, const int height,
+ const uint16_t* const top, const int xstep) {
+ assert(width % 8 == 0);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ const int max_base_index = (width + height - 1) << upsample_shift;
+ const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
+ const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
+ const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
+
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+ const int16x8_t block_step = vdupq_n_s16(base_step8);
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
+
+ int top_x = xstep;
+ int y = 0;
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ // Use signed values to compare |top_base_x| to |max_base_x|.
+ int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
+
+ int x = 0;
+ do {
+ const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
+
+ uint16x8x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x8_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ const uint16x8_t masked_result =
+ vbslq_u16(max_base_mask, combined, final_top_val);
+ vst1q_u16(dst + x, masked_result);
+
+ base_x = vaddq_s16(base_x, block_step);
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+ for (int i = y; i < height; ++i) {
+ Memset(dst, top[max_base_index], width);
+ dst += stride;
+ }
+}
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+inline void DirectionalZone1_Large(uint16_t* dst, const ptrdiff_t stride,
+ const int width, const int height,
+ const uint16_t* const top, const int xstep,
+ const bool upsampled) {
+ assert(width % 8 == 0);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ const int max_base_index = (width + height - 1) << upsample_shift;
+ const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
+ const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
+ const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
+
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+ const int16x8_t block_step = vdupq_n_s16(base_step8);
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
+
+ // Rows up to this y-value can be computed without checking for bounds.
+ const int max_no_corner_y = std::min(
+ ((max_base_index - (base_step * width)) << index_scale_bits) / xstep,
+ height);
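+  // Derivation: row y reads top indices up to roughly
+  // (((y + 1) * xstep) >> index_scale_bits) + base_step * width; keeping this
+  // at or below |max_base_index| gives the bound above.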
+ // No need to check for exceeding |max_base_x| in the first loop.
+ int y = 0;
+ int top_x = xstep;
+ for (; y < max_no_corner_y; ++y, dst += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ int x = 0;
+ do {
+ uint16x8x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x8_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ vst1q_u16(dst + x, combined);
+
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ // Use signed values to compare |top_base_x| to |max_base_x|.
+ int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
+
+ int x = 0;
+ const int min_corner_only_x =
+ std::min(width, ((max_base_index - top_base_x) >> upsample_shift) + 7) &
+ ~7;
+ for (; x < min_corner_only_x; x += 8, top_base_x += base_step8,
+ base_x = vaddq_s16(base_x, block_step)) {
+ const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
+
+ uint16x8x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x8_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ const uint16x8_t masked_result =
+ vbslq_u16(max_base_mask, combined, final_top_val);
+ vst1q_u16(dst + x, masked_result);
+ }
+ // Corner-only section of the row.
+ Memset(dst + x, top[max_base_index], width - x);
+ }
+ for (; y < height; ++y) {
+ Memset(dst, top[max_base_index], width);
+ dst += stride;
+ }
+}
+
+void DirectionalIntraPredictorZone1_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const int width, const int height,
+ const int xstep,
+ const bool upsampled_top) {
+ const uint16_t* const top = static_cast<const uint16_t*>(top_row);
+ uint16_t* dst = static_cast<uint16_t*>(dest);
+ stride /= sizeof(top[0]);
+
+ assert(xstep > 0);
+
+ if (xstep == 64) {
+ assert(!upsampled_top);
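+    // xstep == 64 advances exactly one whole pixel per row with a zero
+    // fractional shift, so each row is a plain copy of the top row shifted by
+    // one sample.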
+ const uint16_t* top_ptr = top + 1;
+ const int width_bytes = width * sizeof(top[0]);
+ int y = height;
+ do {
+ memcpy(dst, top_ptr, width_bytes);
+ memcpy(dst + stride, top_ptr + 1, width_bytes);
+ memcpy(dst + 2 * stride, top_ptr + 2, width_bytes);
+ memcpy(dst + 3 * stride, top_ptr + 3, width_bytes);
+ dst += 4 * stride;
+ top_ptr += 4;
+ y -= 4;
+ } while (y != 0);
+ } else {
+ if (width == 4) {
+ if (upsampled_top) {
+ DirectionalZone1_4xH<true>(dst, stride, height, top, xstep);
+ } else {
+ DirectionalZone1_4xH<false>(dst, stride, height, top, xstep);
+ }
+ } else if (width >= 32) {
+ if (upsampled_top) {
+ DirectionalZone1_Large(dst, stride, width, height, top, xstep, true);
+ } else {
+ DirectionalZone1_Large(dst, stride, width, height, top, xstep, false);
+ }
+ } else if (upsampled_top) {
+ DirectionalZone1_WxH<true>(dst, stride, width, height, top, xstep);
+ } else {
+ DirectionalZone1_WxH<false>(dst, stride, width, height, top, xstep);
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Zone 3
+// This can be considered "the transpose of Zone 1." In Zone 1, the fractional
+// step applies when moving vertically in the destination block, connected to
+// the change in |y|, whereas in this mode, the step applies when moving
+// horizontally, connected to the change in |x|. This makes vectorization very
+// complicated in row-order, because a given vector may need source pixels that
+// span 16 or 32 pixels in steep angles, requiring multiple expensive table
+// lookups and checked loads. Rather than work in row order, it is simpler to
+// compute |dest| in column order, and then store the transposed results.
+
+// Compute 4x4 sub-blocks.
+// Example of computed sub-blocks of a 4x8 block before and after transpose:
+// 00 10 20 30 00 01 02 03
+// 01 11 21 31 10 11 12 13
+// 02 12 22 32 20 21 22 23
+// 03 13 23 33 30 31 32 33
+// ----------- --> -----------
+// 40 50 60 70 40 41 42 43
+// 41 51 61 71 50 51 52 53
+// 42 52 62 72 60 61 62 63
+// 43 53 63 73 70 71 72 73
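+//
+// For each destination column, |left_y| advances by |ystep|; its integer part
+// selects the base sample in |left| and its fraction supplies the 32-part
+// blend weights. Consecutive left samples fill the column, which is then
+// transposed for storage.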
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* dst, const ptrdiff_t stride,
+ const uint16_t* const left, const int ystep,
+ const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ // Compute one column at a time, then transpose for storage.
+ uint16x4_t result[4];
+
+ int left_y = base_left_y + ystep;
+ int left_offset = left_y >> index_scale_bits;
+ int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ int shift_1 = 32 - shift_0;
+ uint16x4x2_t sampled_left_col;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ Transpose4x4(result);
+ Store4(dst, result[0]);
+ dst += stride;
+ Store4(dst, result[1]);
+ dst += stride;
+ Store4(dst, result[2]);
+ dst += stride;
+ Store4(dst, result[3]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4xH(uint8_t* dest, const ptrdiff_t stride,
+ const int height, const uint16_t* const left,
+ const int ystep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ int y = 0;
+ do {
+ DirectionalZone3_4x4<upsampled>(dest, stride, left + (y << upsample_shift),
+ ystep);
+ dest += 4 * stride;
+ y += 4;
+ } while (y < height);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_Wx4(uint8_t* dest, const ptrdiff_t stride,
+ const int width, const uint16_t* const left,
+ const int ystep) {
+ int x = 0;
+ int base_left_y = 0;
+ do {
+ // TODO(petersonab): Establish 8x4 transpose to reserve this function for
+ // 8x4 and 16x4.
+ DirectionalZone3_4x4<upsampled>(dest + 2 * x, stride, left, ystep,
+ base_left_y);
+ base_left_y += 4 * ystep;
+ x += 4;
+ } while (x < width);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_8x8(uint8_t* dest, const ptrdiff_t stride,
+ const uint16_t* const left, const int ystep,
+ const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ // Compute one column at a time, then transpose for storage.
+ uint16x8_t result[8];
+
+ int left_y = base_left_y + ystep;
+ uint16x8x2_t sampled_left_col;
+ int left_offset = left_y >> index_scale_bits;
+ int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ int shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[4] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[5] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[6] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[7] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ Transpose8x8(result);
+ Store8(dest, result[0]);
+ dest += stride;
+ Store8(dest, result[1]);
+ dest += stride;
+ Store8(dest, result[2]);
+ dest += stride;
+ Store8(dest, result[3]);
+ dest += stride;
+ Store8(dest, result[4]);
+ dest += stride;
+ Store8(dest, result[5]);
+ dest += stride;
+ Store8(dest, result[6]);
+ dest += stride;
+ Store8(dest, result[7]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride,
+ const int width, const int height,
+ const uint16_t* const left, const int ystep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ // Zone3 never runs out of left_column values.
+ assert((width + height - 1) << upsample_shift > // max_base_y
+ ((ystep * width) >> (6 - upsample_shift)) +
+ (/* base_step */ 1 << upsample_shift) *
+ (height - 1)); // left_base_y
+ int y = 0;
+ do {
+ int x = 0;
+ uint8_t* dst_x = dest + y * stride;
+ do {
+ const int base_left_y = ystep * x;
+ DirectionalZone3_8x8<upsampled>(
+ dst_x, stride, left + (y << upsample_shift), ystep, base_left_y);
+ dst_x += 8 * sizeof(uint16_t);
+ x += 8;
+ } while (x < width);
+ y += 8;
+ } while (y < height);
+}
+
+void DirectionalIntraPredictorZone3_NEON(void* const dest,
+ const ptrdiff_t stride,
+ const void* const left_column,
+ const int width, const int height,
+ const int ystep,
+ const bool upsampled_left) {
+ const uint16_t* const left = static_cast<const uint16_t*>(left_column);
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ if (ystep == 64) {
+ assert(!upsampled_left);
+ const int width_bytes = width * sizeof(left[0]);
+    const uint16_t* left_ptr = left + 1;
+    int y = height;
+    do {
+ memcpy(dst, left_ptr, width_bytes);
+ memcpy(dst + stride, left_ptr + 1, width_bytes);
+ memcpy(dst + 2 * stride, left_ptr + 2, width_bytes);
+ memcpy(dst + 3 * stride, left_ptr + 3, width_bytes);
+ dst += 4 * stride;
+ left_ptr += 4;
+ y -= 4;
+ } while (y != 0);
+ return;
+ }
+ if (width == 4) {
+ if (upsampled_left) {
+ DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
+ } else {
+ DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
+ }
+ } else if (height == 4) {
+ if (upsampled_left) {
+ DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
+ } else {
+ DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
+ }
+ } else {
+ if (upsampled_left) {
+ // |upsampled_left| can only be true if |width| + |height| <= 16,
+ // therefore this is 8x8.
+ DirectionalZone3_8x8<true>(dst, stride, left, ystep);
+ } else {
+ DirectionalZone3_WxH<false>(dst, stride, width, height, left, ystep);
+ }
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
+ dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredDirectionalInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/intrapred_directional_neon.h b/src/dsp/arm/intrapred_directional_neon.h
new file mode 100644
index 0000000..f7d6235
--- /dev/null
+++ b/src/dsp/arm/intrapred_directional_neon.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*, see the defines below for
+// specifics. These functions are not thread-safe.
+void IntraPredDirectionalInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
+#endif
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
diff --git a/src/dsp/arm/intrapred_filter_intra_neon.cc b/src/dsp/arm/intrapred_filter_neon.cc
index 411708e..bd9f61d 100644
--- a/src/dsp/arm/intrapred_filter_intra_neon.cc
+++ b/src/dsp/arm/intrapred_filter_neon.cc
@@ -1,4 +1,4 @@
-// Copyright 2019 The libgav1 Authors
+// Copyright 2021 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_filter.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
@@ -160,16 +160,16 @@ void Init8bpp() {
} // namespace
} // namespace low_bitdepth
-void IntraPredFilterIntraInit_NEON() { low_bitdepth::Init8bpp(); }
+void IntraPredFilterInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
-void IntraPredFilterIntraInit_NEON() {}
+void IntraPredFilterInit_NEON() {}
} // namespace dsp
} // namespace libgav1
diff --git a/src/dsp/arm/intrapred_filter_neon.h b/src/dsp/arm/intrapred_filter_neon.h
new file mode 100644
index 0000000..283c1b1
--- /dev/null
+++ b/src/dsp/arm/intrapred_filter_neon.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::filter_intra_predictor, see the defines below for specifics.
+// These functions are not thread-safe.
+void IntraPredFilterInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
diff --git a/src/dsp/arm/intrapred_neon.cc b/src/dsp/arm/intrapred_neon.cc
index c967d82..c143648 100644
--- a/src/dsp/arm/intrapred_neon.cc
+++ b/src/dsp/arm/intrapred_neon.cc
@@ -26,6 +26,7 @@
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
@@ -964,6 +965,200 @@ struct DcDefs {
using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
};
+// IntraPredFuncs_NEON::Horizontal -- duplicate left column across all rows
+
+template <int block_height>
+void Horizontal4xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16x4_t row = vld1_dup_u16(left + y);
+ vst1_u16(dst16, row);
+ dst += stride;
+ } while (++y < block_height);
+}
+
+template <int block_height>
+void Horizontal8xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16x8_t row = vld1q_dup_u16(left + y);
+ vst1q_u16(dst16, row);
+ dst += stride;
+ } while (++y < block_height);
+}
+
+template <int block_height>
+void Horizontal16xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ const uint16x8_t row0 = vld1q_dup_u16(left + y);
+ const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row0);
+ vst1q_u16(dst16 + 8, row0);
+ dst += stride;
+ dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row1);
+ vst1q_u16(dst16 + 8, row1);
+ dst += stride;
+ y += 2;
+ } while (y < block_height);
+}
+
+template <int block_height>
+void Horizontal32xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ const uint16x8_t row0 = vld1q_dup_u16(left + y);
+ const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row0);
+ vst1q_u16(dst16 + 8, row0);
+ vst1q_u16(dst16 + 16, row0);
+ vst1q_u16(dst16 + 24, row0);
+ dst += stride;
+ dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row1);
+ vst1q_u16(dst16 + 8, row1);
+ vst1q_u16(dst16 + 16, row1);
+ vst1q_u16(dst16 + 24, row1);
+ dst += stride;
+ y += 2;
+ } while (y < block_height);
+}
+
+// IntraPredFuncs_NEON::Vertical -- copy top row to all rows
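+//
+// These copy raw bytes rather than 16-bit lanes, so at 10bpp an N-pixel-wide
+// block copies 2 * N bytes per row; only the row count varies with the
+// template parameter.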
+
+template <int block_height>
+void Vertical4xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x8_t row = vld1_u8(top);
+ int y = block_height;
+ do {
+ vst1_u8(dst, row);
+ dst += stride;
+ } while (--y != 0);
+}
+
+template <int block_height>
+void Vertical8xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row = vld1q_u8(top);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row);
+ dst += stride;
+ } while (--y != 0);
+}
+
+template <int block_height>
+void Vertical16xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row0 = vld1q_u8(top);
+ const uint8x16_t row1 = vld1q_u8(top + 16);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ dst += stride;
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ dst += stride;
+ y -= 2;
+ } while (y != 0);
+}
+
+template <int block_height>
+void Vertical32xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row0 = vld1q_u8(top);
+ const uint8x16_t row1 = vld1q_u8(top + 16);
+ const uint8x16_t row2 = vld1q_u8(top + 32);
+ const uint8x16_t row3 = vld1q_u8(top + 48);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ dst += stride;
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ dst += stride;
+ y -= 2;
+ } while (y != 0);
+}
+
+template <int block_height>
+void Vertical64xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row0 = vld1q_u8(top);
+ const uint8x16_t row1 = vld1q_u8(top + 16);
+ const uint8x16_t row2 = vld1q_u8(top + 32);
+ const uint8x16_t row3 = vld1q_u8(top + 48);
+ const uint8x16_t row4 = vld1q_u8(top + 64);
+ const uint8x16_t row5 = vld1q_u8(top + 80);
+ const uint8x16_t row6 = vld1q_u8(top + 96);
+ const uint8x16_t row7 = vld1q_u8(top + 112);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ vst1q_u8(dst + 64, row4);
+ vst1q_u8(dst + 80, row5);
+ vst1q_u8(dst + 96, row6);
+ vst1q_u8(dst + 112, row7);
+ dst += stride;
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ vst1q_u8(dst + 64, row4);
+ vst1q_u8(dst + 80, row5);
+ vst1q_u8(dst + 96, row6);
+ vst1q_u8(dst + 112, row7);
+ dst += stride;
+ y -= 2;
+ } while (y != 0);
+}
+
void Init10bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
assert(dsp != nullptr);
@@ -973,6 +1168,8 @@ void Init10bpp() {
DcDefs::_4x4::DcLeft;
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
DcDefs::_4x4::Dc;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+ Vertical4xH_NEON<4>;
// 4x8
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
@@ -981,6 +1178,10 @@ void Init10bpp() {
DcDefs::_4x8::DcLeft;
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
DcDefs::_4x8::Dc;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ Horizontal4xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+ Vertical4xH_NEON<8>;
// 4x16
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
@@ -989,6 +1190,10 @@ void Init10bpp() {
DcDefs::_4x16::DcLeft;
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
DcDefs::_4x16::Dc;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ Horizontal4xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+ Vertical4xH_NEON<16>;
// 8x4
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
@@ -997,6 +1202,8 @@ void Init10bpp() {
DcDefs::_8x4::DcLeft;
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
DcDefs::_8x4::Dc;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+ Vertical8xH_NEON<4>;
// 8x8
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
@@ -1005,6 +1212,10 @@ void Init10bpp() {
DcDefs::_8x8::DcLeft;
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
DcDefs::_8x8::Dc;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ Horizontal8xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+ Vertical8xH_NEON<8>;
// 8x16
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
@@ -1013,6 +1224,8 @@ void Init10bpp() {
DcDefs::_8x16::DcLeft;
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
DcDefs::_8x16::Dc;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+ Vertical8xH_NEON<16>;
// 8x32
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
@@ -1021,6 +1234,10 @@ void Init10bpp() {
DcDefs::_8x32::DcLeft;
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
DcDefs::_8x32::Dc;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ Horizontal8xH_NEON<32>;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+ Vertical8xH_NEON<32>;
// 16x4
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
@@ -1029,6 +1246,8 @@ void Init10bpp() {
DcDefs::_16x4::DcLeft;
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
DcDefs::_16x4::Dc;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+ Vertical16xH_NEON<4>;
// 16x8
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
@@ -1037,6 +1256,10 @@ void Init10bpp() {
DcDefs::_16x8::DcLeft;
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
DcDefs::_16x8::Dc;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ Horizontal16xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+ Vertical16xH_NEON<8>;
// 16x16
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
@@ -1045,6 +1268,8 @@ void Init10bpp() {
DcDefs::_16x16::DcLeft;
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
DcDefs::_16x16::Dc;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+ Vertical16xH_NEON<16>;
// 16x32
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
@@ -1053,6 +1278,8 @@ void Init10bpp() {
DcDefs::_16x32::DcLeft;
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
DcDefs::_16x32::Dc;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+ Vertical16xH_NEON<32>;
// 16x64
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
@@ -1061,6 +1288,8 @@ void Init10bpp() {
DcDefs::_16x64::DcLeft;
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
DcDefs::_16x64::Dc;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+ Vertical16xH_NEON<64>;
// 32x8
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
@@ -1069,6 +1298,8 @@ void Init10bpp() {
DcDefs::_32x8::DcLeft;
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
DcDefs::_32x8::Dc;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+ Vertical32xH_NEON<8>;
// 32x16
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
@@ -1077,6 +1308,8 @@ void Init10bpp() {
DcDefs::_32x16::DcLeft;
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
DcDefs::_32x16::Dc;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+ Vertical32xH_NEON<16>;
// 32x32
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
@@ -1085,6 +1318,8 @@ void Init10bpp() {
DcDefs::_32x32::DcLeft;
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
DcDefs::_32x32::Dc;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+ Vertical32xH_NEON<32>;
// 32x64
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
@@ -1093,6 +1328,10 @@ void Init10bpp() {
DcDefs::_32x64::DcLeft;
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
DcDefs::_32x64::Dc;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ Horizontal32xH_NEON<64>;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+ Vertical32xH_NEON<64>;
// 64x16
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
@@ -1101,6 +1340,8 @@ void Init10bpp() {
DcDefs::_64x16::DcLeft;
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
DcDefs::_64x16::Dc;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+ Vertical64xH_NEON<16>;
// 64x32
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
@@ -1109,6 +1350,8 @@ void Init10bpp() {
DcDefs::_64x32::DcLeft;
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
DcDefs::_64x32::Dc;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+ Vertical64xH_NEON<32>;
// 64x64
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
@@ -1117,6 +1360,8 @@ void Init10bpp() {
DcDefs::_64x64::DcLeft;
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
DcDefs::_64x64::Dc;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+ Vertical64xH_NEON<64>;
}
} // namespace
@@ -1133,7 +1378,7 @@ void IntraPredInit_NEON() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
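The Init10bpp() hunks above only fill in the shared Dsp function table; callers then dispatch through that table rather than calling the NEON routines directly. A hedged sketch of the call pattern follows -- GetDspTable() and the intra_predictors layout are taken from dsp.h as I read it, and the local variable names are purely illustrative.

// Sketch of dispatching through the table populated by Init10bpp().
const Dsp* const dsp = GetDspTable(/*bitdepth=*/10);
assert(dsp != nullptr);
auto* const pred =
    dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical];
if (pred != nullptr) {
  // On a NEON build after this patch, this runs Vertical8xH_NEON<8>.
  pred(dest, stride, top_row, left_column);
}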
diff --git a/src/dsp/arm/intrapred_neon.h b/src/dsp/arm/intrapred_neon.h
index 16f858c..b27f29f 100644
--- a/src/dsp/arm/intrapred_neon.h
+++ b/src/dsp/arm/intrapred_neon.h
@@ -23,396 +23,282 @@
namespace libgav1 {
namespace dsp {
-// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*,
-// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and
-// Dsp::filter_intra_predictor, see the defines below for specifics. These
-// functions are not thread-safe.
-void IntraPredCflInit_NEON();
-void IntraPredDirectionalInit_NEON();
-void IntraPredFilterIntraInit_NEON();
+// Initializes Dsp::intra_predictors.
+// See the defines below for specifics. These functions are not thread-safe.
void IntraPredInit_NEON();
-void IntraPredSmoothInit_NEON();
} // namespace dsp
} // namespace libgav1
#if LIBGAV1_ENABLE_NEON
-// 8 bit
-#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
-
// 4x4
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
// 4x8
#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
// 4x16
#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
// 8x4
#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
// 8x8
#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
// 8x16
#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
// 8x32
#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x4
#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x8
#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x16
#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x32
#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x64
#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 32x8
#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
// 32x16
#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
// 32x32
#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
// 32x64
#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 64x16
#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 64x32
#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 64x64
#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 10 bit
// 4x4
#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 4x8
#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 4x16
#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 8x4
#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 8x8
#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 8x16
#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 8x32
#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x4
#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x8
#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x16
#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x32
#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x64
#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 32x8
#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 32x16
#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 32x32
#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 32x64
#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 64x16
#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 64x32
#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 64x64
#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_
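The per-size LIBGAV1_Dsp10bpp_* macros added above follow the usual libgav1 convention for advertising a SIMD implementation: a header defines the macro to LIBGAV1_CPU_NEON, and the portable init code skips registering its C fallback for that slot. The guard below is an illustrative sketch of that convention, not a verbatim quote from the C sources; the fallback function name is hypothetical.

// Illustrative guard as it typically appears in the portable init code.
#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical
  // No SIMD header claimed this slot, so register the C fallback.
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
      IntraPredictorVertical_C;  // hypothetical C fallback name
#endif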
diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc
index abc93e8..c33f333 100644
--- a/src/dsp/arm/intrapred_smooth_neon.cc
+++ b/src/dsp/arm/intrapred_smooth_neon.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_smooth.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
@@ -26,6 +26,7 @@
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
@@ -605,7 +606,7 @@ void IntraPredSmoothInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/intrapred_smooth_neon.h b/src/dsp/arm/intrapred_smooth_neon.h
new file mode 100644
index 0000000..edd01be
--- /dev/null
+++ b/src/dsp/arm/intrapred_smooth_neon.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
+void IntraPredSmoothInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
diff --git a/src/dsp/arm/inverse_transform_10bit_neon.cc b/src/dsp/arm/inverse_transform_10bit_neon.cc
new file mode 100644
index 0000000..ff184a1
--- /dev/null
+++ b/src/dsp/arm/inverse_transform_10bit_neon.cc
@@ -0,0 +1,2543 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+//------------------------------------------------------------------------------
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int32x4_t in[4],
+ int32x4_t out[4]) {
+ // in:
+ // 00 01 02 03
+ // 10 11 12 13
+ // 20 21 22 23
+ // 30 31 32 33
+
+ // 00 10 02 12 a.val[0]
+ // 01 11 03 13 a.val[1]
+ // 20 30 22 32 b.val[0]
+ // 21 31 23 33 b.val[1]
+ const int32x4x2_t a = vtrnq_s32(in[0], in[1]);
+ const int32x4x2_t b = vtrnq_s32(in[2], in[3]);
+ out[0] = vextq_s32(vextq_s32(a.val[0], a.val[0], 2), b.val[0], 2);
+ out[1] = vextq_s32(vextq_s32(a.val[1], a.val[1], 2), b.val[1], 2);
+ out[2] = vextq_s32(a.val[0], vextq_s32(b.val[0], b.val[0], 2), 2);
+ out[3] = vextq_s32(a.val[1], vextq_s32(b.val[1], b.val[1], 2), 2);
+ // out:
+ // 00 10 20 30
+ // 01 11 21 31
+ // 02 12 22 32
+ // 03 13 23 33
+}
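The lane diagrams in the comments above describe the vtrn/vext shuffle; as a cross-check, here is a plain scalar statement of the same 4x4 transpose. The helper is hypothetical and exists only to document the intended result.

// Scalar reference for Transpose4x4: out[r][c] = in[c][r].
void Transpose4x4Scalar(const int32_t in[4][4], int32_t out[4][4]) {
  for (int r = 0; r < 4; ++r) {
    for (int c = 0; c < 4; ++c) out[r][c] = in[c][r];
  }
}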
+
+//------------------------------------------------------------------------------
+template <int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* dst, int32_t stride, int32_t idx,
+ const int32x4_t* const s) {
+ assert(store_count % 4 == 0);
+ for (int i = 0; i < store_count; i += 4) {
+ vst1q_s32(&dst[i * stride + idx], s[i]);
+ vst1q_s32(&dst[(i + 1) * stride + idx], s[i + 1]);
+ vst1q_s32(&dst[(i + 2) * stride + idx], s[i + 2]);
+ vst1q_s32(&dst[(i + 3) * stride + idx], s[i + 3]);
+ }
+}
+
+template <int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int32_t* src, int32_t stride,
+ int32_t idx, int32x4_t* x) {
+ assert(load_count % 4 == 0);
+ for (int i = 0; i < load_count; i += 4) {
+ x[i] = vld1q_s32(&src[i * stride + idx]);
+ x[i + 1] = vld1q_s32(&src[(i + 1) * stride + idx]);
+ x[i + 2] = vld1q_s32(&src[(i + 2) * stride + idx]);
+ x[i + 3] = vld1q_s32(&src[(i + 3) * stride + idx]);
+ }
+}
+
+// Butterfly rotate 4 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int32x4_t* a, int32x4_t* b,
+ const int angle,
+ const bool flip) {
+ const int32_t cos128 = Cos128(angle);
+ const int32_t sin128 = Sin128(angle);
+ const int32x4_t acc_x = vmulq_n_s32(*a, cos128);
+ const int32x4_t acc_y = vmulq_n_s32(*a, sin128);
+  // The max range for the input is 18 bits. The cos128/sin128 values are 13
+  // bits, which leaves 1 bit for the add/subtract. For 10bpp, x/y will fit in
+  // a 32-bit lane.

+ const int32x4_t x0 = vmlsq_n_s32(acc_x, *b, sin128);
+ const int32x4_t y0 = vmlaq_n_s32(acc_y, *b, cos128);
+ const int32x4_t x = vrshrq_n_s32(x0, 12);
+ const int32x4_t y = vrshrq_n_s32(y0, 12);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
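In scalar terms, the butterfly above computes x = (a*cos - b*sin + 2048) >> 12 and y = (a*sin + b*cos + 2048) >> 12, with cos128/sin128 in Q12 fixed point; the vrshrq_n_s32 calls supply the +2048 rounding. The sketch below restates that, assuming the Cos128/Sin128 helpers from inverse_transform.inc; the function name is hypothetical.

// Scalar restatement of ButterflyRotation_4 (Q12 fixed-point rotation).
void ButterflyRotationScalarSketch(int32_t* a, int32_t* b, int angle,
                                   bool flip) {
  const int64_t cos128 = Cos128(angle);  // Q12 cosine
  const int64_t sin128 = Sin128(angle);  // Q12 sine
  const int64_t x = (*a * cos128 - *b * sin128 + 2048) >> 12;
  const int64_t y = (*a * sin128 + *b * cos128 + 2048) >> 12;
  *a = static_cast<int32_t>(flip ? y : x);
  *b = static_cast<int32_t>(flip ? x : y);
}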
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int32x4_t* a,
+ int32x4_t* b,
+ const int angle,
+ const bool flip) {
+ const int32_t cos128 = Cos128(angle);
+ const int32_t sin128 = Sin128(angle);
+ assert(sin128 <= 0xfff);
+ const int32x4_t x0 = vmulq_n_s32(*b, -sin128);
+ const int32x4_t y0 = vmulq_n_s32(*b, cos128);
+ const int32x4_t x = vrshrq_n_s32(x0, 12);
+ const int32x4_t y = vrshrq_n_s32(y0, 12);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int32x4_t* a,
+ int32x4_t* b,
+ const int angle,
+ const bool flip) {
+ const int32_t cos128 = Cos128(angle);
+ const int32_t sin128 = Sin128(angle);
+ const int32x4_t x0 = vmulq_n_s32(*a, cos128);
+ const int32x4_t y0 = vmulq_n_s32(*a, sin128);
+ const int32x4_t x = vrshrq_n_s32(x0, 12);
+ const int32x4_t y = vrshrq_n_s32(y0, 12);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
+ bool flip) {
+ int32x4_t x, y;
+ if (flip) {
+ y = vqaddq_s32(*b, *a);
+ x = vqsubq_s32(*b, *a);
+ } else {
+ x = vqaddq_s32(*a, *b);
+ y = vqsubq_s32(*a, *b);
+ }
+ *a = x;
+ *b = y;
+}
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
+ bool flip, const int32x4_t* min,
+ const int32x4_t* max) {
+ int32x4_t x, y;
+ if (flip) {
+ y = vqaddq_s32(*b, *a);
+ x = vqsubq_s32(*b, *a);
+ } else {
+ x = vqaddq_s32(*a, *b);
+ y = vqsubq_s32(*a, *b);
+ }
+ *a = vmaxq_s32(vminq_s32(x, *max), *min);
+ *b = vmaxq_s32(vminq_s32(y, *max), *min);
+}
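The clamped HadamardRotation is the 10-bit range-control step: sum and difference, then clamp each result to the per-stage [min, max] window so intermediates stay representable. A hedged scalar sketch (the saturating vqaddq/vqsubq behave identically here because the clamp window is narrower than int32):

// Scalar sketch of the clamped Hadamard step.
void HadamardRotationScalarSketch(int32_t* a, int32_t* b, bool flip,
                                  int32_t min, int32_t max) {
  const int64_t sum = int64_t{*a} + *b;
  const int64_t new_a = flip ? int64_t{*b} - *a : sum;          // b - a when flipped
  const int64_t new_b = flip ? sum : int64_t{*a} - *b;          // a - b otherwise
  *a = static_cast<int32_t>(
      std::min<int64_t>(std::max<int64_t>(new_a, min), max));
  *b = static_cast<int32_t>(
      std::min<int64_t>(std::max<int64_t>(new_b, min), max));
}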
+
+using ButterflyRotationFunc = void (*)(int32x4_t* a, int32x4_t* b, int angle,
+ bool flip);
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t s0 = vbslq_s32(v_mask, v_src_round, v_src);
+ const int32_t cos128 = Cos128(32);
+ const int32x4_t xy = vqrdmulhq_n_s32(s0, cos128 << (31 - 12));
+  // vqrshlq_s32 will shift right if the shift value is negative.
+ const int32x4_t xy_shifted = vqrshlq_s32(xy, vdupq_n_s32(-row_shift));
+ // Clamp result to signed 16 bits.
+ const int32x4_t result = vmovl_s16(vqmovn_s32(xy_shifted));
+ if (width == 4) {
+ vst1q_s32(dst, result);
+ } else {
+ for (int i = 0; i < width; i += 4) {
+ vst1q_s32(dst, result);
+ dst += 4;
+ }
+ }
+ return true;
+}
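A hedged scalar restatement of the DC-only row fast path above: optionally pre-round by kTransformRowMultiplier (sqrt(2) in Q12, from inverse_transform.inc), multiply by Cos128(32) (cos(pi/4) in Q12), apply the rounding row shift, saturate to int16, and broadcast across the row. The vqrdmulhq_n_s32 calls match this rounding for in-range inputs; the helper name is hypothetical.

// Hypothetical scalar sketch of DctDcOnly (<algorithm>/<cstdint> are already
// included by this file).
bool DctDcOnlyScalarSketch(int32_t* dst, int width, int adjusted_tx_height,
                           bool should_round, int row_shift) {
  if (adjusted_tx_height > 1) return false;
  const auto round_shift = [](int64_t v, int bits) {
    return bits == 0 ? v : (v + (int64_t{1} << (bits - 1))) >> bits;
  };
  int64_t dc = dst[0];
  if (should_round) dc = round_shift(dc * kTransformRowMultiplier, 12);
  dc = round_shift(dc * Cos128(32), 12);  // cos(pi/4) in Q12.
  dc = round_shift(dc, row_shift);
  dc = std::min<int64_t>(std::max<int64_t>(dc, INT16_MIN), INT16_MAX);
  for (int i = 0; i < width; ++i) dst[i] = static_cast<int32_t>(dc);
  return true;
}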
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32_t cos128 = Cos128(32);
+
+  // Calculate the DC values for the first row.
+ if (width == 4) {
+ const int32x4_t v_src = vld1q_s32(dst);
+ const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
+ vst1q_s32(dst, xy);
+ } else {
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(&dst[i]);
+ const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
+ vst1q_s32(&dst[i], xy);
+ i += 4;
+ } while (i < width);
+ }
+
+ // Copy first row to the rest of the block.
+ for (int y = 1; y < height; ++y) {
+ memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+ }
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(int32x4_t* s, const int32x4_t* min,
+ const int32x4_t* max,
+ const bool is_last_stage) {
+ // stage 12.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+ ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+ } else {
+ butterfly_rotation(&s[0], &s[1], 32, true);
+ butterfly_rotation(&s[2], &s[3], 48, false);
+ }
+
+ // stage 17.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[3], false);
+ HadamardRotation(&s[1], &s[2], false);
+ } else {
+ HadamardRotation(&s[0], &s[3], false, min, max);
+ HadamardRotation(&s[1], &s[2], false, min, max);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+  // When |is_row| is true, set the range to the row range; otherwise, use the
+  // column range.
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[4], x[4];
+
+ LoadSrc<4>(dst, step, 0, x);
+ if (is_row) {
+ Transpose4x4(x, x);
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 2, 1, 3
+ s[0] = x[0];
+ s[1] = x[2];
+ s[2] = x[1];
+ s[3] = x[3];
+
+ Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 4; ++i) {
+ s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
+ }
+ Transpose4x4(s, s);
+ }
+ StoreDst<4>(dst, step, 0, s);
+}
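The row path above ends with an idiom that recurs in every Dct*_NEON function: vmovl_s16(vqmovn_s32(vqrshlq_s32(v, -row_shift))), i.e. a rounding right shift followed by saturation to the int16 range, kept in a 32-bit lane. A scalar restatement (hypothetical helper) for reference:

// Scalar equivalent of the per-lane row-shift-and-clamp idiom.
int32_t RowShiftAndClamp16(int32_t v, int row_shift) {
  const int64_t shifted =
      row_shift > 0
          ? (int64_t{v} + (int64_t{1} << (row_shift - 1))) >> row_shift
          : int64_t{v};
  return static_cast<int32_t>(
      std::min<int64_t>(std::max<int64_t>(shifted, INT16_MIN), INT16_MAX));
}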
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(int32x4_t* s, const int32x4_t* min,
+ const int32x4_t* max,
+ const bool is_last_stage) {
+ // stage 8.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+ ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+ } else {
+ butterfly_rotation(&s[4], &s[7], 56, false);
+ butterfly_rotation(&s[5], &s[6], 24, false);
+ }
+
+ // stage 13.
+ HadamardRotation(&s[4], &s[5], false, min, max);
+ HadamardRotation(&s[6], &s[7], true, min, max);
+
+ // stage 18.
+ butterfly_rotation(&s[6], &s[5], 32, true);
+
+ // stage 22.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[7], false);
+ HadamardRotation(&s[1], &s[6], false);
+ HadamardRotation(&s[2], &s[5], false);
+ HadamardRotation(&s[3], &s[4], false);
+ } else {
+ HadamardRotation(&s[0], &s[7], false, min, max);
+ HadamardRotation(&s[1], &s[6], false, min, max);
+ HadamardRotation(&s[2], &s[5], false, min, max);
+ HadamardRotation(&s[3], &s[4], false, min, max);
+ }
+}
+
+// Process dct8 rows or columns, depending on the |is_row| flag.
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[8], x[8];
+
+ if (is_row) {
+ LoadSrc<4>(dst, step, 0, &x[0]);
+ LoadSrc<4>(dst, step, 4, &x[4]);
+ Transpose4x4(&x[0], &x[0]);
+ Transpose4x4(&x[4], &x[4]);
+ } else {
+ LoadSrc<8>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+ s[0] = x[0];
+ s[1] = x[4];
+ s[2] = x[2];
+ s[3] = x[6];
+ s[4] = x[1];
+ s[5] = x[5];
+ s[6] = x[3];
+ s[7] = x[7];
+
+ Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
+ Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 8; ++i) {
+ s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
+ }
+ Transpose4x4(&s[0], &s[0]);
+ Transpose4x4(&s[4], &s[4]);
+ StoreDst<4>(dst, step, 0, &s[0]);
+ StoreDst<4>(dst, step, 4, &s[4]);
+ } else {
+ StoreDst<8>(dst, step, 0, &s[0]);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(int32x4_t* s, const int32x4_t* min,
+ const int32x4_t* max,
+ const bool is_last_stage) {
+ // stage 5.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+ ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+ ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+ ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+ } else {
+ butterfly_rotation(&s[8], &s[15], 60, false);
+ butterfly_rotation(&s[9], &s[14], 28, false);
+ butterfly_rotation(&s[10], &s[13], 44, false);
+ butterfly_rotation(&s[11], &s[12], 12, false);
+ }
+
+ // stage 9.
+ HadamardRotation(&s[8], &s[9], false, min, max);
+ HadamardRotation(&s[10], &s[11], true, min, max);
+ HadamardRotation(&s[12], &s[13], false, min, max);
+ HadamardRotation(&s[14], &s[15], true, min, max);
+
+ // stage 14.
+ butterfly_rotation(&s[14], &s[9], 48, true);
+ butterfly_rotation(&s[13], &s[10], 112, true);
+
+ // stage 19.
+ HadamardRotation(&s[8], &s[11], false, min, max);
+ HadamardRotation(&s[9], &s[10], false, min, max);
+ HadamardRotation(&s[12], &s[15], true, min, max);
+ HadamardRotation(&s[13], &s[14], true, min, max);
+
+ // stage 23.
+ butterfly_rotation(&s[13], &s[10], 32, true);
+ butterfly_rotation(&s[12], &s[11], 32, true);
+
+ // stage 26.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[15], false);
+ HadamardRotation(&s[1], &s[14], false);
+ HadamardRotation(&s[2], &s[13], false);
+ HadamardRotation(&s[3], &s[12], false);
+ HadamardRotation(&s[4], &s[11], false);
+ HadamardRotation(&s[5], &s[10], false);
+ HadamardRotation(&s[6], &s[9], false);
+ HadamardRotation(&s[7], &s[8], false);
+ } else {
+ HadamardRotation(&s[0], &s[15], false, min, max);
+ HadamardRotation(&s[1], &s[14], false, min, max);
+ HadamardRotation(&s[2], &s[13], false, min, max);
+ HadamardRotation(&s[3], &s[12], false, min, max);
+ HadamardRotation(&s[4], &s[11], false, min, max);
+ HadamardRotation(&s[5], &s[10], false, min, max);
+ HadamardRotation(&s[6], &s[9], false, min, max);
+ HadamardRotation(&s[7], &s[8], false, min, max);
+ }
+}
+
+// Process dct16 rows or columns, depending on the |is_row| flag.
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[16], x[16];
+
+ if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ LoadSrc<16>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1
+ // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ s[0] = x[0];
+ s[1] = x[8];
+ s[2] = x[4];
+ s[3] = x[12];
+ s[4] = x[2];
+ s[5] = x[10];
+ s[6] = x[6];
+ s[7] = x[14];
+ s[8] = x[1];
+ s[9] = x[9];
+ s[10] = x[5];
+ s[11] = x[13];
+ s[12] = x[3];
+ s[13] = x[11];
+ s[14] = x[7];
+ s[15] = x[15];
+
+ Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
+ Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
+ Dct16Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 16; ++i) {
+ s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
+ }
+ for (int idx = 0; idx < 16; idx += 8) {
+ Transpose4x4(&s[idx], &s[idx]);
+ Transpose4x4(&s[idx + 4], &s[idx + 4]);
+ StoreDst<4>(dst, step, idx, &s[idx]);
+ StoreDst<4>(dst, step, idx + 4, &s[idx + 4]);
+ }
+ } else {
+ StoreDst<16>(dst, step, 0, &s[0]);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(int32x4_t* s, const int32x4_t* min,
+ const int32x4_t* max,
+ const bool is_last_stage) {
+ // stage 3
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+ ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+ ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+ ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+ ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+ ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+ ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+ ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+ } else {
+ butterfly_rotation(&s[16], &s[31], 62, false);
+ butterfly_rotation(&s[17], &s[30], 30, false);
+ butterfly_rotation(&s[18], &s[29], 46, false);
+ butterfly_rotation(&s[19], &s[28], 14, false);
+ butterfly_rotation(&s[20], &s[27], 54, false);
+ butterfly_rotation(&s[21], &s[26], 22, false);
+ butterfly_rotation(&s[22], &s[25], 38, false);
+ butterfly_rotation(&s[23], &s[24], 6, false);
+ }
+
+ // stage 6.
+ HadamardRotation(&s[16], &s[17], false, min, max);
+ HadamardRotation(&s[18], &s[19], true, min, max);
+ HadamardRotation(&s[20], &s[21], false, min, max);
+ HadamardRotation(&s[22], &s[23], true, min, max);
+ HadamardRotation(&s[24], &s[25], false, min, max);
+ HadamardRotation(&s[26], &s[27], true, min, max);
+ HadamardRotation(&s[28], &s[29], false, min, max);
+ HadamardRotation(&s[30], &s[31], true, min, max);
+
+ // stage 10.
+ butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+ butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+ butterfly_rotation(&s[26], &s[21], 24, true);
+ butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+ // stage 15.
+ HadamardRotation(&s[16], &s[19], false, min, max);
+ HadamardRotation(&s[17], &s[18], false, min, max);
+ HadamardRotation(&s[20], &s[23], true, min, max);
+ HadamardRotation(&s[21], &s[22], true, min, max);
+ HadamardRotation(&s[24], &s[27], false, min, max);
+ HadamardRotation(&s[25], &s[26], false, min, max);
+ HadamardRotation(&s[28], &s[31], true, min, max);
+ HadamardRotation(&s[29], &s[30], true, min, max);
+
+ // stage 20.
+ butterfly_rotation(&s[29], &s[18], 48, true);
+ butterfly_rotation(&s[28], &s[19], 48, true);
+ butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+ butterfly_rotation(&s[26], &s[21], 48 + 64, true);
+
+ // stage 24.
+ HadamardRotation(&s[16], &s[23], false, min, max);
+ HadamardRotation(&s[17], &s[22], false, min, max);
+ HadamardRotation(&s[18], &s[21], false, min, max);
+ HadamardRotation(&s[19], &s[20], false, min, max);
+ HadamardRotation(&s[24], &s[31], true, min, max);
+ HadamardRotation(&s[25], &s[30], true, min, max);
+ HadamardRotation(&s[26], &s[29], true, min, max);
+ HadamardRotation(&s[27], &s[28], true, min, max);
+
+ // stage 27.
+ butterfly_rotation(&s[27], &s[20], 32, true);
+ butterfly_rotation(&s[26], &s[21], 32, true);
+ butterfly_rotation(&s[25], &s[22], 32, true);
+ butterfly_rotation(&s[24], &s[23], 32, true);
+
+ // stage 29.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[31], false);
+ HadamardRotation(&s[1], &s[30], false);
+ HadamardRotation(&s[2], &s[29], false);
+ HadamardRotation(&s[3], &s[28], false);
+ HadamardRotation(&s[4], &s[27], false);
+ HadamardRotation(&s[5], &s[26], false);
+ HadamardRotation(&s[6], &s[25], false);
+ HadamardRotation(&s[7], &s[24], false);
+ HadamardRotation(&s[8], &s[23], false);
+ HadamardRotation(&s[9], &s[22], false);
+ HadamardRotation(&s[10], &s[21], false);
+ HadamardRotation(&s[11], &s[20], false);
+ HadamardRotation(&s[12], &s[19], false);
+ HadamardRotation(&s[13], &s[18], false);
+ HadamardRotation(&s[14], &s[17], false);
+ HadamardRotation(&s[15], &s[16], false);
+ } else {
+ HadamardRotation(&s[0], &s[31], false, min, max);
+ HadamardRotation(&s[1], &s[30], false, min, max);
+ HadamardRotation(&s[2], &s[29], false, min, max);
+ HadamardRotation(&s[3], &s[28], false, min, max);
+ HadamardRotation(&s[4], &s[27], false, min, max);
+ HadamardRotation(&s[5], &s[26], false, min, max);
+ HadamardRotation(&s[6], &s[25], false, min, max);
+ HadamardRotation(&s[7], &s[24], false, min, max);
+ HadamardRotation(&s[8], &s[23], false, min, max);
+ HadamardRotation(&s[9], &s[22], false, min, max);
+ HadamardRotation(&s[10], &s[21], false, min, max);
+ HadamardRotation(&s[11], &s[20], false, min, max);
+ HadamardRotation(&s[12], &s[19], false, min, max);
+ HadamardRotation(&s[13], &s[18], false, min, max);
+ HadamardRotation(&s[14], &s[17], false, min, max);
+ HadamardRotation(&s[15], &s[16], false, min, max);
+ }
+}
+
+// Process dct32 rows or columns, depending on the |is_row| flag.
+LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
+ const bool is_row, int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[32], x[32];
+
+ if (is_row) {
+ for (int idx = 0; idx < 32; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ LoadSrc<32>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
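+ // (For reference, a sketch of how this ordering arises: each position i in
+ // 0..15 maps to its 5-bit bit-reversal, e.g. 1 = 00001b -> 10000b = 16,
+ // 2 = 00010b -> 01000b = 8, 3 = 00011b -> 11000b = 24.)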
+ s[0] = x[0];
+ s[1] = x[16];
+ s[2] = x[8];
+ s[3] = x[24];
+ s[4] = x[4];
+ s[5] = x[20];
+ s[6] = x[12];
+ s[7] = x[28];
+ s[8] = x[2];
+ s[9] = x[18];
+ s[10] = x[10];
+ s[11] = x[26];
+ s[12] = x[6];
+ s[13] = x[22];
+ s[14] = x[14];
+ s[15] = x[30];
+
+ // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+ s[16] = x[1];
+ s[17] = x[17];
+ s[18] = x[9];
+ s[19] = x[25];
+ s[20] = x[5];
+ s[21] = x[21];
+ s[22] = x[13];
+ s[23] = x[29];
+ s[24] = x[3];
+ s[25] = x[19];
+ s[26] = x[11];
+ s[27] = x[27];
+ s[28] = x[7];
+ s[29] = x[23];
+ s[30] = x[15];
+ s[31] = x[31];
+
+ Dct4Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
+ Dct8Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
+ Dct16Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
+ Dct32Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int idx = 0; idx < 32; idx += 8) {
+ int32x4_t output[8];
+ Transpose4x4(&s[idx], &output[0]);
+ Transpose4x4(&s[idx + 4], &output[4]);
+ for (int i = 0; i < 8; ++i) {
+ output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
+ }
+ StoreDst<4>(dst, step, idx, &output[0]);
+ StoreDst<4>(dst, step, idx + 4, &output[4]);
+ }
+ } else {
+ StoreDst<32>(dst, step, 0, &s[0]);
+ }
+}
+
+void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[64], x[32];
+
+ if (is_row) {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ for (int idx = 0; idx < 32; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ // The last 32 values of every column are always zero if the |tx_height| is
+ // 64.
+ LoadSrc<32>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+ s[0] = x[0];
+ s[2] = x[16];
+ s[4] = x[8];
+ s[6] = x[24];
+ s[8] = x[4];
+ s[10] = x[20];
+ s[12] = x[12];
+ s[14] = x[28];
+
+ // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+ s[16] = x[2];
+ s[18] = x[18];
+ s[20] = x[10];
+ s[22] = x[26];
+ s[24] = x[6];
+ s[26] = x[22];
+ s[28] = x[14];
+ s[30] = x[30];
+
+ // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+ s[32] = x[1];
+ s[34] = x[17];
+ s[36] = x[9];
+ s[38] = x[25];
+ s[40] = x[5];
+ s[42] = x[21];
+ s[44] = x[13];
+ s[46] = x[29];
+
+ // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
+ s[48] = x[3];
+ s[50] = x[19];
+ s[52] = x[11];
+ s[54] = x[27];
+ s[56] = x[7];
+ s[58] = x[23];
+ s[60] = x[15];
+ s[62] = x[31];
+
+ Dct4Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, &min, &max, /*is_last_stage=*/false);
+ Dct8Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, &min, &max, /*is_last_stage=*/false);
+ Dct16Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, &min, &max, /*is_last_stage=*/false);
+ Dct32Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, &min, &max, /*is_last_stage=*/false);
+
+ //-- start dct 64 stages
+ // stage 2.
+ ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
+ ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
+ ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
+ ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
+ ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
+ ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
+ ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
+ ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
+ ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
+ ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
+ ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
+ ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
+ ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
+ ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
+ ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
+ ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
+
+ // stage 4.
+ HadamardRotation(&s[32], &s[33], false, &min, &max);
+ HadamardRotation(&s[34], &s[35], true, &min, &max);
+ HadamardRotation(&s[36], &s[37], false, &min, &max);
+ HadamardRotation(&s[38], &s[39], true, &min, &max);
+ HadamardRotation(&s[40], &s[41], false, &min, &max);
+ HadamardRotation(&s[42], &s[43], true, &min, &max);
+ HadamardRotation(&s[44], &s[45], false, &min, &max);
+ HadamardRotation(&s[46], &s[47], true, &min, &max);
+ HadamardRotation(&s[48], &s[49], false, &min, &max);
+ HadamardRotation(&s[50], &s[51], true, &min, &max);
+ HadamardRotation(&s[52], &s[53], false, &min, &max);
+ HadamardRotation(&s[54], &s[55], true, &min, &max);
+ HadamardRotation(&s[56], &s[57], false, &min, &max);
+ HadamardRotation(&s[58], &s[59], true, &min, &max);
+ HadamardRotation(&s[60], &s[61], false, &min, &max);
+ HadamardRotation(&s[62], &s[63], true, &min, &max);
+
+ // stage 7.
+ ButterflyRotation_4(&s[62], &s[33], 60 - 0, true);
+ ButterflyRotation_4(&s[61], &s[34], 60 - 0 + 64, true);
+ ButterflyRotation_4(&s[58], &s[37], 60 - 32, true);
+ ButterflyRotation_4(&s[57], &s[38], 60 - 32 + 64, true);
+ ButterflyRotation_4(&s[54], &s[41], 60 - 16, true);
+ ButterflyRotation_4(&s[53], &s[42], 60 - 16 + 64, true);
+ ButterflyRotation_4(&s[50], &s[45], 60 - 48, true);
+ ButterflyRotation_4(&s[49], &s[46], 60 - 48 + 64, true);
+
+ // stage 11.
+ HadamardRotation(&s[32], &s[35], false, &min, &max);
+ HadamardRotation(&s[33], &s[34], false, &min, &max);
+ HadamardRotation(&s[36], &s[39], true, &min, &max);
+ HadamardRotation(&s[37], &s[38], true, &min, &max);
+ HadamardRotation(&s[40], &s[43], false, &min, &max);
+ HadamardRotation(&s[41], &s[42], false, &min, &max);
+ HadamardRotation(&s[44], &s[47], true, &min, &max);
+ HadamardRotation(&s[45], &s[46], true, &min, &max);
+ HadamardRotation(&s[48], &s[51], false, &min, &max);
+ HadamardRotation(&s[49], &s[50], false, &min, &max);
+ HadamardRotation(&s[52], &s[55], true, &min, &max);
+ HadamardRotation(&s[53], &s[54], true, &min, &max);
+ HadamardRotation(&s[56], &s[59], false, &min, &max);
+ HadamardRotation(&s[57], &s[58], false, &min, &max);
+ HadamardRotation(&s[60], &s[63], true, &min, &max);
+ HadamardRotation(&s[61], &s[62], true, &min, &max);
+
+ // stage 16.
+ ButterflyRotation_4(&s[61], &s[34], 56, true);
+ ButterflyRotation_4(&s[60], &s[35], 56, true);
+ ButterflyRotation_4(&s[59], &s[36], 56 + 64, true);
+ ButterflyRotation_4(&s[58], &s[37], 56 + 64, true);
+ ButterflyRotation_4(&s[53], &s[42], 56 - 32, true);
+ ButterflyRotation_4(&s[52], &s[43], 56 - 32, true);
+ ButterflyRotation_4(&s[51], &s[44], 56 - 32 + 64, true);
+ ButterflyRotation_4(&s[50], &s[45], 56 - 32 + 64, true);
+
+ // stage 21.
+ HadamardRotation(&s[32], &s[39], false, &min, &max);
+ HadamardRotation(&s[33], &s[38], false, &min, &max);
+ HadamardRotation(&s[34], &s[37], false, &min, &max);
+ HadamardRotation(&s[35], &s[36], false, &min, &max);
+ HadamardRotation(&s[40], &s[47], true, &min, &max);
+ HadamardRotation(&s[41], &s[46], true, &min, &max);
+ HadamardRotation(&s[42], &s[45], true, &min, &max);
+ HadamardRotation(&s[43], &s[44], true, &min, &max);
+ HadamardRotation(&s[48], &s[55], false, &min, &max);
+ HadamardRotation(&s[49], &s[54], false, &min, &max);
+ HadamardRotation(&s[50], &s[53], false, &min, &max);
+ HadamardRotation(&s[51], &s[52], false, &min, &max);
+ HadamardRotation(&s[56], &s[63], true, &min, &max);
+ HadamardRotation(&s[57], &s[62], true, &min, &max);
+ HadamardRotation(&s[58], &s[61], true, &min, &max);
+ HadamardRotation(&s[59], &s[60], true, &min, &max);
+
+ // stage 25.
+ ButterflyRotation_4(&s[59], &s[36], 48, true);
+ ButterflyRotation_4(&s[58], &s[37], 48, true);
+ ButterflyRotation_4(&s[57], &s[38], 48, true);
+ ButterflyRotation_4(&s[56], &s[39], 48, true);
+ ButterflyRotation_4(&s[55], &s[40], 112, true);
+ ButterflyRotation_4(&s[54], &s[41], 112, true);
+ ButterflyRotation_4(&s[53], &s[42], 112, true);
+ ButterflyRotation_4(&s[52], &s[43], 112, true);
+
+ // stage 28.
+ HadamardRotation(&s[32], &s[47], false, &min, &max);
+ HadamardRotation(&s[33], &s[46], false, &min, &max);
+ HadamardRotation(&s[34], &s[45], false, &min, &max);
+ HadamardRotation(&s[35], &s[44], false, &min, &max);
+ HadamardRotation(&s[36], &s[43], false, &min, &max);
+ HadamardRotation(&s[37], &s[42], false, &min, &max);
+ HadamardRotation(&s[38], &s[41], false, &min, &max);
+ HadamardRotation(&s[39], &s[40], false, &min, &max);
+ HadamardRotation(&s[48], &s[63], true, &min, &max);
+ HadamardRotation(&s[49], &s[62], true, &min, &max);
+ HadamardRotation(&s[50], &s[61], true, &min, &max);
+ HadamardRotation(&s[51], &s[60], true, &min, &max);
+ HadamardRotation(&s[52], &s[59], true, &min, &max);
+ HadamardRotation(&s[53], &s[58], true, &min, &max);
+ HadamardRotation(&s[54], &s[57], true, &min, &max);
+ HadamardRotation(&s[55], &s[56], true, &min, &max);
+
+ // stage 30.
+ ButterflyRotation_4(&s[55], &s[40], 32, true);
+ ButterflyRotation_4(&s[54], &s[41], 32, true);
+ ButterflyRotation_4(&s[53], &s[42], 32, true);
+ ButterflyRotation_4(&s[52], &s[43], 32, true);
+ ButterflyRotation_4(&s[51], &s[44], 32, true);
+ ButterflyRotation_4(&s[50], &s[45], 32, true);
+ ButterflyRotation_4(&s[49], &s[46], 32, true);
+ ButterflyRotation_4(&s[48], &s[47], 32, true);
+
+ // stage 31.
+ for (int i = 0; i < 32; i += 4) {
+ HadamardRotation(&s[i], &s[63 - i], false, &min, &max);
+ HadamardRotation(&s[i + 1], &s[63 - i - 1], false, &min, &max);
+ HadamardRotation(&s[i + 2], &s[63 - i - 2], false, &min, &max);
+ HadamardRotation(&s[i + 3], &s[63 - i - 3], false, &min, &max);
+ }
+ //-- end dct 64 stages
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int idx = 0; idx < 64; idx += 8) {
+ int32x4_t output[8];
+ Transpose4x4(&s[idx], &output[0]);
+ Transpose4x4(&s[idx + 4], &output[4]);
+ for (int i = 0; i < 8; ++i) {
+ output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
+ }
+ StoreDst<4>(dst, step, idx, &output[0]);
+ StoreDst<4>(dst, step, idx + 4, &output[4]);
+ }
+ } else {
+ StoreDst<64>(dst, step, 0, &s[0]);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ int32x4_t s[8];
+ int32x4_t x[4];
+
+ LoadSrc<4>(dst, step, 0, x);
+ if (is_row) {
+ Transpose4x4(x, x);
+ }
+
+ // stage 1.
+ s[5] = vmulq_n_s32(x[3], kAdst4Multiplier[1]);
+ s[6] = vmulq_n_s32(x[3], kAdst4Multiplier[3]);
+
+ // stage 2.
+ const int32x4_t a7 = vsubq_s32(x[0], x[2]);
+ const int32x4_t b7 = vaddq_s32(a7, x[3]);
+
+ // stage 3.
+ s[0] = vmulq_n_s32(x[0], kAdst4Multiplier[0]);
+ s[1] = vmulq_n_s32(x[0], kAdst4Multiplier[1]);
+ // s[0] = s[0] + s[3]
+ s[0] = vmlaq_n_s32(s[0], x[2], kAdst4Multiplier[3]);
+ // s[1] = s[1] - s[4]
+ s[1] = vmlsq_n_s32(s[1], x[2], kAdst4Multiplier[0]);
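+ // (Here s[3] and s[4] stand for the reference implementation's
+ // intermediates x[2] * kAdst4Multiplier[3] and x[2] * kAdst4Multiplier[0];
+ // both products are folded into the vmlaq/vmlsq accumulations above.)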
+
+ s[3] = vmulq_n_s32(x[1], kAdst4Multiplier[2]);
+ s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);
+
+ // stage 4.
+ s[0] = vaddq_s32(s[0], s[5]);
+ s[1] = vsubq_s32(s[1], s[6]);
+
+ // stages 5 and 6.
+ const int32x4_t x0 = vaddq_s32(s[0], s[3]);
+ const int32x4_t x1 = vaddq_s32(s[1], s[3]);
+ const int32x4_t x3_a = vaddq_s32(s[0], s[1]);
+ const int32x4_t x3 = vsubq_s32(x3_a, s[3]);
+ x[0] = vrshrq_n_s32(x0, 12);
+ x[1] = vrshrq_n_s32(x1, 12);
+ x[2] = vrshrq_n_s32(s[2], 12);
+ x[3] = vrshrq_n_s32(x3, 12);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ x[0] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[0], v_row_shift)));
+ x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift)));
+ x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift)));
+ x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift)));
+ Transpose4x4(x, x);
+ }
+ StoreDst<4>(dst, step, 0, x);
+}
+
+alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
+ 2482};
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[2];
+
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src0_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src0_round, v_src0);
+ const int32x4_t kAdst4DcOnlyMultipliers = vld1q_s32(kAdst4DcOnlyMultiplier);
+ s[1] = vdupq_n_s32(0);
+
+ // s0*k0 s0*k1 s0*k2 s0*k1
+ s[0] = vmulq_s32(kAdst4DcOnlyMultipliers, v_src);
+ // 0 0 0 s0*k0
+ s[1] = vextq_s32(s[1], s[0], 1);
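+ // (Worked through lane by lane: the sum below is
+ // { s0*k0, s0*k1, s0*k2, s0*(k0 + k1) }, i.e. the four ADST4 outputs for a
+ // DC-only input, prior to the >> 12 rounding.)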
+
+ const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+ const int32x4_t dst_0 = vrshrq_n_s32(x3, 12);
+
+ // vqrshlq_s32 will shift right if shift value is negative.
+ vst1q_s32(dst,
+ vmovl_s16(vqmovn_s32(vqrshlq_s32(dst_0, vdupq_n_s32(-row_shift)))));
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[4];
+
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(&dst[i]);
+
+ s[0] = vmulq_n_s32(v_src, kAdst4Multiplier[0]);
+ s[1] = vmulq_n_s32(v_src, kAdst4Multiplier[1]);
+ s[2] = vmulq_n_s32(v_src, kAdst4Multiplier[2]);
+
+ const int32x4_t x0 = s[0];
+ const int32x4_t x1 = s[1];
+ const int32x4_t x2 = s[2];
+ const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+ const int32x4_t dst_0 = vrshrq_n_s32(x0, 12);
+ const int32x4_t dst_1 = vrshrq_n_s32(x1, 12);
+ const int32x4_t dst_2 = vrshrq_n_s32(x2, 12);
+ const int32x4_t dst_3 = vrshrq_n_s32(x3, 12);
+
+ vst1q_s32(&dst[i], dst_0);
+ vst1q_s32(&dst[i + width * 1], dst_1);
+ vst1q_s32(&dst[i + width * 2], dst_2);
+ vst1q_s32(&dst[i + width * 3], dst_3);
+
+ i += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[8], x[8];
+
+ if (is_row) {
+ LoadSrc<4>(dst, step, 0, &x[0]);
+ LoadSrc<4>(dst, step, 4, &x[4]);
+ Transpose4x4(&x[0], &x[0]);
+ Transpose4x4(&x[4], &x[4]);
+ } else {
+ LoadSrc<8>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1.
+ s[0] = x[7];
+ s[1] = x[0];
+ s[2] = x[5];
+ s[3] = x[2];
+ s[4] = x[3];
+ s[5] = x[4];
+ s[6] = x[1];
+ s[7] = x[6];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+ butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+ butterfly_rotation(&s[6], &s[7], 60 - 48, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[4], false, &min, &max);
+ HadamardRotation(&s[1], &s[5], false, &min, &max);
+ HadamardRotation(&s[2], &s[6], false, &min, &max);
+ HadamardRotation(&s[3], &s[7], false, &min, &max);
+
+ // stage 4.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[2], false, &min, &max);
+ HadamardRotation(&s[4], &s[6], false, &min, &max);
+ HadamardRotation(&s[1], &s[3], false, &min, &max);
+ HadamardRotation(&s[5], &s[7], false, &min, &max);
+
+ // stage 6.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s32(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s32(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s32(s[1]);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 8; ++i) {
+ x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift)));
+ }
+ Transpose4x4(&x[0], &x[0]);
+ Transpose4x4(&x[4], &x[4]);
+ StoreDst<4>(dst, step, 0, &x[0]);
+ StoreDst<4>(dst, step, 4, &x[4]);
+ } else {
+ StoreDst<8>(dst, step, 0, &x[0]);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[8];
+
+ const int32x4_t v_src = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+ // stage 1.
+ s[1] = vbslq_s32(v_mask, v_src_round, v_src);
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ int32x4_t x[8];
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s32(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s32(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s32(s[1]);
+
+ for (int i = 0; i < 8; ++i) {
+ // vqrshlq_s32 will shift right if shift value is negative.
+ x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift))));
+ vst1q_lane_s32(&dst[i], x[i], 0);
+ }
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[8];
+
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ int32x4_t x[8];
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s32(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s32(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s32(s[1]);
+
+ for (int j = 0; j < 8; ++j) {
+ vst1q_s32(&dst[j * width], x[j]);
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[16], x[16];
+
+ if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ LoadSrc<16>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1.
+ s[0] = x[15];
+ s[1] = x[0];
+ s[2] = x[13];
+ s[3] = x[2];
+ s[4] = x[11];
+ s[5] = x[4];
+ s[6] = x[9];
+ s[7] = x[6];
+ s[8] = x[7];
+ s[9] = x[8];
+ s[10] = x[5];
+ s[11] = x[10];
+ s[12] = x[3];
+ s[13] = x[12];
+ s[14] = x[1];
+ s[15] = x[14];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+ butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+ butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+ butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+ butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+ butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+ butterfly_rotation(&s[14], &s[15], 62 - 56, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[8], false, &min, &max);
+ HadamardRotation(&s[1], &s[9], false, &min, &max);
+ HadamardRotation(&s[2], &s[10], false, &min, &max);
+ HadamardRotation(&s[3], &s[11], false, &min, &max);
+ HadamardRotation(&s[4], &s[12], false, &min, &max);
+ HadamardRotation(&s[5], &s[13], false, &min, &max);
+ HadamardRotation(&s[6], &s[14], false, &min, &max);
+ HadamardRotation(&s[7], &s[15], false, &min, &max);
+
+ // stage 4.
+ butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+ butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+ butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 8 + 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[4], false, &min, &max);
+ HadamardRotation(&s[8], &s[12], false, &min, &max);
+ HadamardRotation(&s[1], &s[5], false, &min, &max);
+ HadamardRotation(&s[9], &s[13], false, &min, &max);
+ HadamardRotation(&s[2], &s[6], false, &min, &max);
+ HadamardRotation(&s[10], &s[14], false, &min, &max);
+ HadamardRotation(&s[3], &s[7], false, &min, &max);
+ HadamardRotation(&s[11], &s[15], false, &min, &max);
+
+ // stage 6.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 48 - 32, true);
+
+ // stage 7.
+ HadamardRotation(&s[0], &s[2], false, &min, &max);
+ HadamardRotation(&s[4], &s[6], false, &min, &max);
+ HadamardRotation(&s[8], &s[10], false, &min, &max);
+ HadamardRotation(&s[12], &s[14], false, &min, &max);
+ HadamardRotation(&s[1], &s[3], false, &min, &max);
+ HadamardRotation(&s[5], &s[7], false, &min, &max);
+ HadamardRotation(&s[9], &s[11], false, &min, &max);
+ HadamardRotation(&s[13], &s[15], false, &min, &max);
+
+ // stage 8.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+ butterfly_rotation(&s[10], &s[11], 32, true);
+ butterfly_rotation(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[8]);
+ x[2] = s[12];
+ x[3] = vqnegq_s32(s[4]);
+ x[4] = s[6];
+ x[5] = vqnegq_s32(s[14]);
+ x[6] = s[10];
+ x[7] = vqnegq_s32(s[2]);
+ x[8] = s[3];
+ x[9] = vqnegq_s32(s[11]);
+ x[10] = s[15];
+ x[11] = vqnegq_s32(s[7]);
+ x[12] = s[5];
+ x[13] = vqnegq_s32(s[13]);
+ x[14] = s[9];
+ x[15] = vqnegq_s32(s[1]);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 16; ++i) {
+ x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift)));
+ }
+ for (int idx = 0; idx < 16; idx += 8) {
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ StoreDst<4>(dst, step, idx, &x[idx]);
+ StoreDst<4>(dst, step, idx + 4, &x[idx + 4]);
+ }
+ } else {
+ StoreDst<16>(dst, step, 0, &x[0]);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int32x4_t* s, int32x4_t* x) {
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);
+
+ // stage 3.
+ s[8] = s[0];
+ s[9] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[8], &s[9], 56, true);
+
+ // stage 5.
+ s[4] = s[0];
+ s[12] = s[8];
+ s[5] = s[1];
+ s[13] = s[9];
+
+ // stage 6.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+ ButterflyRotation_4(&s[12], &s[13], 48, true);
+
+ // stage 7.
+ s[2] = s[0];
+ s[6] = s[4];
+ s[10] = s[8];
+ s[14] = s[12];
+ s[3] = s[1];
+ s[7] = s[5];
+ s[11] = s[9];
+ s[15] = s[13];
+
+ // stage 8.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+ ButterflyRotation_4(&s[10], &s[11], 32, true);
+ ButterflyRotation_4(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[8]);
+ x[2] = s[12];
+ x[3] = vqnegq_s32(s[4]);
+ x[4] = s[6];
+ x[5] = vqnegq_s32(s[14]);
+ x[6] = s[10];
+ x[7] = vqnegq_s32(s[2]);
+ x[8] = s[3];
+ x[9] = vqnegq_s32(s[11]);
+ x[10] = s[15];
+ x[11] = vqnegq_s32(s[7]);
+ x[12] = s[5];
+ x[13] = vqnegq_s32(s[13]);
+ x[14] = s[9];
+ x[15] = vqnegq_s32(s[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[16];
+ int32x4_t x[16];
+ const int32x4_t v_src = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+ // stage 1.
+ s[1] = vbslq_s32(v_mask, v_src_round, v_src);
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int i = 0; i < 16; ++i) {
+ // vqrshlq_s32 will shift right if shift value is negative.
+ x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift))));
+ vst1q_lane_s32(&dst[i], x[i], 0);
+ }
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+ int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int i = 0;
+ do {
+ int32x4_t s[16];
+ int32x4_t x[16];
+ const int32x4_t v_src = vld1q_s32(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int j = 0; j < 16; ++j) {
+ vst1q_s32(&dst[j * width], x[j]);
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+
+LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step, int shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
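+ // (Reading of the constant, as a sketch: v_dual_round = (1 << 11) +
+ // (shift << 12), i.e. the +2048 rounding term for the >> 12 of the identity
+ // multiply plus the rounding term for the extra row shift, so the single
+ // vqshlq by -(12 + shift) below applies both rounded shifts at once.)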
+ for (int i = 0; i < 4; ++i) {
+ const int32x4_t v_src = vld1q_s32(&dst[i * step]);
+ const int32x4_t v_src_mult_lo =
+ vmlaq_s32(v_dual_round, v_src, v_multiplier);
+ const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+ vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(shift_lo)));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+ const int shift = tx_height < 16 ? 0 : 1;
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+ const int32x4_t v_src_mult_lo = vmlaq_s32(v_dual_round, v_src, v_multiplier);
+ const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
+ vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+ return true;
+}
+
+template <int identity_size>
+LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
+ Array2DView<uint16_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int32_t* source) {
+ static_assert(identity_size == 4 || identity_size == 8 || identity_size == 16,
+ "Invalid identity_size.");
+ const int stride = frame.columns();
+ uint16_t* dst = frame[start_y] + start_x;
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (1 << 4)) << 11);
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ int32x4x2_t v_src, v_dst_i, a, b;
+ v_src.val[0] = vld1q_s32(&source[i * 4]);
+ v_src.val[1] = vld1q_s32(&source[(i * 4) + 4]);
+ if (identity_size == 4) {
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ } else if (identity_size == 8) {
+ v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
+ v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
+ a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
+ a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
+ } else { // identity_size == 16
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ }
+ uint16x4x2_t frame_data;
+ frame_data.val[0] = vld1_u16(dst);
+ frame_data.val[1] = vld1_u16(dst + stride);
+ b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+ b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+ vst1_u16(dst, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+ vst1_u16(dst + stride, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+ dst += stride << 1;
+ i += 2;
+ } while (i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ int32x4x2_t v_src, v_dst_i, a, b;
+ v_src.val[0] = vld1q_s32(&source[row + j]);
+ v_src.val[1] = vld1q_s32(&source[row + j + 4]);
+ if (identity_size == 4) {
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ } else if (identity_size == 8) {
+ v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
+ v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
+ a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
+ a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
+ } else { // identity_size == 16
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ }
+ uint16x4x2_t frame_data;
+ frame_data.val[0] = vld1_u16(dst + j);
+ frame_data.val[1] = vld1_u16(dst + j + 4);
+ b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+ b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+ vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+ vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
+ Array2DView<uint16_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int32_t* source) {
+ const int stride = frame.columns();
+ uint16_t* dst = frame[start_y] + start_x;
+ const int32x4_t v_round = vdupq_n_s32((1 + (0)) << 11);
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(&source[i * 4]);
+ const int32x4_t v_dst_row =
+ vshrq_n_s32(vmlaq_n_s32(v_round, v_src, kIdentity4Multiplier), 12);
+ const int32x4_t v_dst_col =
+ vmlaq_n_s32(v_round, v_dst_row, kIdentity4Multiplier);
+ const uint16x4_t frame_data = vld1_u16(dst);
+ const int32x4_t a = vrshrq_n_s32(v_dst_col, 4 + 12);
+ const int32x4_t b = vaddw_s16(a, vreinterpret_s16_u16(frame_data));
+ vst1_u16(dst, vmin_u16(vqmovun_s32(b), v_max_bitdepth));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ int32x4x2_t v_src, v_src_round, v_dst_row, v_dst_col, a, b;
+ v_src.val[0] = vld1q_s32(&source[row + j]);
+ v_src.val[1] = vld1q_s32(&source[row + j + 4]);
+ v_src_round.val[0] = vshrq_n_s32(
+ vmlaq_n_s32(v_round, v_src.val[0], kTransformRowMultiplier), 12);
+ v_src_round.val[1] = vshrq_n_s32(
+ vmlaq_n_s32(v_round, v_src.val[1], kTransformRowMultiplier), 12);
+ v_dst_row.val[0] = vqaddq_s32(v_src_round.val[0], v_src_round.val[0]);
+ v_dst_row.val[1] = vqaddq_s32(v_src_round.val[1], v_src_round.val[1]);
+ v_dst_col.val[0] =
+ vmlaq_n_s32(v_round, v_dst_row.val[0], kIdentity4Multiplier);
+ v_dst_col.val[1] =
+ vmlaq_n_s32(v_round, v_dst_row.val[1], kIdentity4Multiplier);
+ uint16x4x2_t frame_data;
+ frame_data.val[0] = vld1_u16(dst + j);
+ frame_data.val[1] = vld1_u16(dst + j + 4);
+ a.val[0] = vrshrq_n_s32(v_dst_col.val[0], 4 + 12);
+ a.val[1] = vrshrq_n_s32(v_dst_col.val[1], 4 + 12);
+ b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+ b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+ vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+ vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int32_t*>(dest);
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height equal to 32 can be simplified from
+ // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
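+ // (Sketch: e.g. A = 100 gives ((100 * 2) + 2) >> 2 = 50 and
+ // (100 + 1) >> 1 = 50, which is the vrshrq_n_s32(x, 1) below.)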
+ for (int i = 0; i < 4; ++i) {
+ const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]);
+ const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]);
+ const int32x4_t a_lo = vrshrq_n_s32(v_src_lo, 1);
+ const int32x4_t a_hi = vrshrq_n_s32(v_src_hi, 1);
+ vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(a_lo)));
+ vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(a_hi)));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int32_t*>(dest);
+
+ for (int i = 0; i < 4; ++i) {
+ const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]);
+ const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]);
+ const int32x4_t v_srcx2_lo = vqaddq_s32(v_src_lo, v_src_lo);
+ const int32x4_t v_srcx2_hi = vqaddq_s32(v_src_hi, v_src_hi);
+ vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(v_srcx2_lo)));
+ vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(v_srcx2_hi)));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+ const int32x4_t v_srcx2 = vaddq_s32(v_src, v_src);
+ const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift));
+ vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step,
+ int shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ int32x4x2_t v_src;
+ v_src.val[0] = vld1q_s32(&dst[i * step + j * 8]);
+ v_src.val[1] = vld1q_s32(&dst[i * step + j * 8 + 4]);
+ const int32x4_t v_src_mult_lo =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+ const int32x4_t v_src_mult_hi =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+ const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+ const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift);
+ vst1q_s32(&dst[i * step + j * 8], vmovl_s16(vqmovn_s32(shift_lo)));
+ vst1q_s32(&dst[i * step + j * 8 + 4], vmovl_s16(vqmovn_s32(shift_hi)));
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_src_mult_lo =
+ vmlaq_n_s32(v_dual_round, v_src, kIdentity16Multiplier);
+ const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, vdupq_n_s32(-(12 + shift)));
+ vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int32_t* source, int tx_width) {
+ if (tx_width >= 16) {
+ int i = 0;
+ do {
+ // 00 01 02 03
+ const int32x4_t a = vld1q_s32(&source[i]);
+ const int32x4_t b = vld1q_s32(&source[i + 4]);
+ const int32x4_t c = vld1q_s32(&source[i + 8]);
+ const int32x4_t d = vld1q_s32(&source[i + 12]);
+ // 01 00 03 02
+ const int32x4_t a_rev = vrev64q_s32(a);
+ const int32x4_t b_rev = vrev64q_s32(b);
+ const int32x4_t c_rev = vrev64q_s32(c);
+ const int32x4_t d_rev = vrev64q_s32(d);
+ // 03 02 01 00
+ vst1q_s32(&source[i], vextq_s32(d_rev, d_rev, 2));
+ vst1q_s32(&source[i + 4], vextq_s32(c_rev, c_rev, 2));
+ vst1q_s32(&source[i + 8], vextq_s32(b_rev, b_rev, 2));
+ vst1q_s32(&source[i + 12], vextq_s32(a_rev, a_rev, 2));
+ i += 16;
+ } while (i < tx_width * tx_height);
+ } else if (tx_width == 8) {
+ for (int i = 0; i < 8 * tx_height; i += 8) {
+ // 00 01 02 03
+ const int32x4_t a = vld1q_s32(&source[i]);
+ const int32x4_t b = vld1q_s32(&source[i + 4]);
+ // 01 00 03 02
+ const int32x4_t a_rev = vrev64q_s32(a);
+ const int32x4_t b_rev = vrev64q_s32(b);
+ // 03 02 01 00
+ vst1q_s32(&source[i], vextq_s32(b_rev, b_rev, 2));
+ vst1q_s32(&source[i + 4], vextq_s32(a_rev, a_rev, 2));
+ }
+ } else {
+ // Process two rows per iteration.
+ for (int i = 0; i < 4 * tx_height; i += 8) {
+ // 00 01 02 03
+ const int32x4_t a = vld1q_s32(&source[i]);
+ const int32x4_t b = vld1q_s32(&source[i + 4]);
+ // 01 00 03 02
+ const int32x4_t a_rev = vrev64q_s32(a);
+ const int32x4_t b_rev = vrev64q_s32(b);
+ // 03 02 01 00
+ vst1q_s32(&source[i], vextq_s32(a_rev, a_rev, 2));
+ vst1q_s32(&source[i + 4], vextq_s32(b_rev, b_rev, 2));
+ }
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int32_t* source, int num_rows) {
+ // Process eight values per iteration.
+ int i = 0;
+ do {
+ const int32x4_t a_lo = vld1q_s32(&source[i]);
+ const int32x4_t a_hi = vld1q_s32(&source[i + 4]);
+ const int32x4_t b_lo =
+ vqrdmulhq_n_s32(a_lo, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t b_hi =
+ vqrdmulhq_n_s32(a_hi, kTransformRowMultiplier << (31 - 12));
+ vst1q_s32(&source[i], b_lo);
+ vst1q_s32(&source[i + 4], b_hi);
+ i += 8;
+ } while (i < tx_width * num_rows);
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int32_t* source, int num_rows,
+ int row_shift) {
+ // vqrshlq_s32 will shift right if shift value is negative.
+ row_shift = -row_shift;
+
+ // Process eight values per iteration.
+ int i = 0;
+ do {
+ const int32x4_t residual0 = vld1q_s32(&source[i]);
+ const int32x4_t residual1 = vld1q_s32(&source[i + 4]);
+ vst1q_s32(&source[i], vqrshlq_s32(residual0, vdupq_n_s32(row_shift)));
+ vst1q_s32(&source[i + 4], vqrshlq_s32(residual1, vdupq_n_s32(row_shift)));
+ i += 8;
+ } while (i < tx_width * num_rows);
+}
+
+template <int tx_height, bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+ Array2DView<uint16_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int32_t* source, TransformType tx_type) {
+ const bool flip_rows =
+ enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+ const int stride = frame.columns();
+ uint16_t* dst = frame[start_y] + start_x;
+
+ if (tx_width == 4) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+ const int32x4_t residual = vld1q_s32(&source[row]);
+ const uint16x4_t frame_data = vld1_u16(dst);
+ const int32x4_t a = vrshrq_n_s32(residual, 4);
+ const uint32x4_t b = vaddw_u16(vreinterpretq_u32_s32(a), frame_data);
+ const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
+ vst1_u16(dst, vmin_u16(d, vdup_n_u16((1 << kBitdepth10) - 1)));
+ dst += stride;
+ }
+ } else {
+ for (int i = 0; i < tx_height; ++i) {
+ const int y = start_y + i;
+ const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+ int j = 0;
+ do {
+ const int x = start_x + j;
+ const int32x4_t residual = vld1q_s32(&source[row + j]);
+ const int32x4_t residual_hi = vld1q_s32(&source[row + j + 4]);
+ const uint16x8_t frame_data = vld1q_u16(frame[y] + x);
+ const int32x4_t a = vrshrq_n_s32(residual, 4);
+ const int32x4_t a_hi = vrshrq_n_s32(residual_hi, 4);
+ const uint32x4_t b =
+ vaddw_u16(vreinterpretq_u32_s32(a), vget_low_u16(frame_data));
+ const uint32x4_t b_hi =
+ vaddw_u16(vreinterpretq_u32_s32(a_hi), vget_high_u16(frame_data));
+ const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
+ const uint16x4_t d_hi = vqmovun_s32(vreinterpretq_s32_u32(b_hi));
+ vst1q_u16(frame[y] + x, vminq_u16(vcombine_u16(d, d_hi),
+ vdupq_n_u16((1 << kBitdepth10) - 1)));
+ j += 8;
+ } while (j < tx_width);
+ }
+ }
+}
+
+void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ const int row_shift = (tx_height == 16);
+
+ if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d dct4 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Dct4_NEON<ButterflyRotation_4>(data, /*step=*/4, /*is_row=*/true,
+ row_shift);
+ data += 16;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct4 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct4_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d dct8 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Dct8_NEON<ButterflyRotation_4>(data, /*step=*/8, /*is_row=*/true,
+ row_shift);
+ data += 32;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct8 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct8_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ // Process 4 1d dct16 rows in parallel per iteration.
+ Dct16_NEON<ButterflyRotation_4>(data, 16, /*is_row=*/true, row_shift);
+ data += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct16 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<32>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ // Process 4 1d dct32 rows in parallel per iteration.
+ Dct32_NEON(data, 32, /*is_row=*/true, row_shift);
+ data += 128;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<32>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct32 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<64>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ // Process 4 1d dct64 rows in parallel per iteration.
+ Dct64_NEON(data, 64, /*is_row=*/true, row_shift);
+ data += 128 * 2;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<64>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct64 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const int row_shift = static_cast<int>(tx_height == 16);
+ const bool should_round = (tx_height == 8);
+
+ if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst4 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Adst4_NEON(data, /*step=*/4, /*is_row=*/true, row_shift);
+ data += 16;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d adst4 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Adst4_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst8 rows in parallel per iteration.
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Adst8_NEON<ButterflyRotation_4>(data, /*step=*/8,
+ /*is_row=*/true, row_shift);
+ data += 32;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d adst8 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Adst8_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ do {
+ // Process 4 1d adst16 rows in parallel per iteration.
+ Adst16_NEON<ButterflyRotation_4>(src, 16, /*is_row=*/true, row_shift);
+ src += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Adst16TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ int i = tx_width;
+ auto* data = src;
+ do {
+ // Process 4 1d adst16 columns in parallel per iteration.
+ Adst16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Identity4TransformLoopRow_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize4x4) {
+ return;
+ }
+
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+
+ if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ const int shift = tx_height > 8 ? 1 : 0;
+ int i = adjusted_tx_height;
+ do {
+ Identity4_NEON(src, /*step=*/4, shift);
+ src += 16;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity4TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ // Special case: Process row calculations during column transform call.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+ Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+ return;
+ }
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity8TransformLoopRow_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize8x4) {
+ return;
+ }
+
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 16 can be simplified
+ // from ((A * 2) + 1) >> 1 to A. For 10bpp, A must be clamped to a signed
+ // 16-bit value.
+ if ((tx_height & 0x18) != 0) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int32x4_t v_src_lo = vld1q_s32(&src[i * 8]);
+ const int32x4_t v_src_hi = vld1q_s32(&src[(i * 8) + 4]);
+ vst1q_s32(&src[i * 8], vmovl_s16(vqmovn_s32(v_src_lo)));
+ vst1q_s32(&src[(i * 8) + 4], vmovl_s16(vqmovn_s32(v_src_hi)));
+ }
+ return;
+ }
+ if (tx_height == 32) {
+ int i = adjusted_tx_height;
+ do {
+ Identity8Row32_NEON(src, /*step=*/8);
+ src += 32;
+ i -= 4;
+ } while (i != 0);
+ return;
+ }
+
+ assert(tx_size == kTransformSize8x4);
+ int i = adjusted_tx_height;
+ do {
+ Identity8Row4_NEON(src, /*step=*/8);
+ src += 32;
+ i -= 4;
+ } while (i != 0);
+}
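A minimal scalar sketch (not part of the patch) of the tx_height 8/16 branch above: per the comment, folding the identity8 multiplier into the 1-bit row shift turns ((A * 2) + 1) >> 1 into A, so only the 10bpp clamp to the int16_t range remains, which is what the vqmovn_s32/vmovl_s16 pair does per lane.

    #include <algorithm>
    #include <cstdint>

    // Per-lane equivalent of vmovl_s16(vqmovn_s32(v)) in the loop above.
    int32_t Identity8RowClampSketch(int32_t a) {
      return std::min<int32_t>(std::max<int32_t>(a, INT16_MIN), INT16_MAX);
    }
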
+
+void Identity8TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+ int i = adjusted_tx_height;
+ do {
+ Identity16Row_NEON(src, /*step=*/16, row_shift);
+ src += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity16TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+//------------------------------------------------------------------------------
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ // Maximum transform size for Dct is 64.
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ Dct4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ Dct4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ Dct8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ Dct8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ Dct16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ Dct16TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ Dct32TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ Dct32TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ Dct64TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ Dct64TransformLoopColumn_NEON;
+
+ // Maximum transform size for Adst is 16.
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ Adst4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ Adst4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ Adst8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ Adst8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ Adst16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ Adst16TransformLoopColumn_NEON;
+
+ // Maximum transform size for Identity transform is 32.
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ Identity4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ Identity4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ Identity8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ Identity8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ Identity16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ Identity16TransformLoopColumn_NEON;
+}
+
+} // namespace
+
+void InverseTransformInit10bpp_NEON() { Init10bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_ENABLE_NEON || LIBGAV1_MAX_BITDEPTH < 10
+namespace libgav1 {
+namespace dsp {
+
+void InverseTransformInit10bpp_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/arm/inverse_transform_neon.cc b/src/dsp/arm/inverse_transform_neon.cc
index 072991a..315d5e9 100644
--- a/src/dsp/arm/inverse_transform_neon.cc
+++ b/src/dsp/arm/inverse_transform_neon.cc
@@ -3117,7 +3117,7 @@ void InverseTransformInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/inverse_transform_neon.h b/src/dsp/arm/inverse_transform_neon.h
index af647e8..91e0e83 100644
--- a/src/dsp/arm/inverse_transform_neon.h
+++ b/src/dsp/arm/inverse_transform_neon.h
@@ -26,6 +26,7 @@ namespace dsp {
// Initializes Dsp::inverse_transforms, see the defines below for specifics.
// This function is not thread-safe.
void InverseTransformInit_NEON();
+void InverseTransformInit10bpp_NEON();
} // namespace dsp
} // namespace libgav1
@@ -47,6 +48,21 @@ void InverseTransformInit_NEON();
#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize32_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize64_1DTransformDct LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformAdst LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformIdentity LIBGAV1_CPU_NEON
+
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_
diff --git a/src/dsp/arm/loop_filter_neon.cc b/src/dsp/arm/loop_filter_neon.cc
index 146c983..8d72892 100644
--- a/src/dsp/arm/loop_filter_neon.cc
+++ b/src/dsp/arm/loop_filter_neon.cc
@@ -35,7 +35,7 @@ namespace {
// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
inline uint8x8_t Hev(const uint8x8_t abd_p0p1_q0q1, const uint8_t thresh) {
const uint8x8_t a = vcgt_u8(abd_p0p1_q0q1, vdup_n_u8(thresh));
- return vorr_u8(a, RightShift<32>(a));
+ return vorr_u8(a, RightShiftVector<32>(a));
}
// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
@@ -44,7 +44,7 @@ inline uint8x8_t OuterThreshold(const uint8x8_t p0q0, const uint8x8_t p1q1,
const uint8x8x2_t a = Interleave32(p0q0, p1q1);
const uint8x8_t b = vabd_u8(a.val[0], a.val[1]);
const uint8x8_t p0q0_double = vqadd_u8(b, b);
- const uint8x8_t p1q1_half = RightShift<32>(vshr_n_u8(b, 1));
+ const uint8x8_t p1q1_half = RightShiftVector<32>(vshr_n_u8(b, 1));
const uint8x8_t c = vqadd_u8(p0q0_double, p1q1_half);
return vcle_u8(c, vdup_n_u8(outer_thresh));
}
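A minimal scalar model (illustration only) of the packed p/q layout that Hev() and OuterThreshold() assume: lane i of the 8-byte vector holds a p-side value and lane i + 4 the matching q-side value, so RightShiftVector<32> lines the q lanes up with the p lanes before they are combined.

    #include <cstdint>

    void HevModel(const uint8_t abd_p0p1_q0q1[8], uint8_t thresh, uint8_t out[8]) {
      uint8_t a[8];
      for (int i = 0; i < 8; ++i) a[i] = (abd_p0p1_q0q1[i] > thresh) ? 0xFF : 0;
      // RightShiftVector<32>: move lanes 4..7 down to lanes 0..3.
      const uint8_t shifted[8] = {a[4], a[5], a[6], a[7], 0, 0, 0, 0};
      // vorr_u8: the low lanes now hold (|p1-p0| > thresh) || (|q1-q0| > thresh).
      for (int i = 0; i < 8; ++i) out[i] = a[i] | shifted[i];
    }
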
@@ -56,7 +56,7 @@ inline uint8x8_t NeedsFilter4(const uint8x8_t abd_p0p1_q0q1,
const uint8_t inner_thresh,
const uint8_t outer_thresh) {
const uint8x8_t a = vcle_u8(abd_p0p1_q0q1, vdup_n_u8(inner_thresh));
- const uint8x8_t inner_mask = vand_u8(a, RightShift<32>(a));
+ const uint8x8_t inner_mask = vand_u8(a, RightShiftVector<32>(a));
const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
return vand_u8(inner_mask, outer_mask);
}
@@ -121,7 +121,7 @@ inline void Filter4(const uint8x8_t q0p1, const uint8x8_t p0q1,
vcombine_s16(vget_low_s16(p0q1_l), vget_low_s16(q0p1_l));
// Need to shift the second term or we end up with a2_ma2.
const int8x8_t a2_ma1 =
- InterleaveLow32(a2_a1, RightShift<32>(vneg_s8(a2_a1)));
+ InterleaveLow32(a2_a1, RightShiftVector<32>(vneg_s8(a2_a1)));
const int16x8_t p0q0_a = vaddw_s8(p0q0_l, a2_ma1);
*p1q1_result = vqmovun_s16(p1q1_a3);
@@ -251,7 +251,7 @@ inline uint8x8_t IsFlat3(const uint8x8_t abd_p0p1_q0q1,
const uint8x8_t abd_p0p2_q0q2) {
const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p0p2_q0q2);
const uint8x8_t b = vcle_u8(a, vdup_n_u8(1));
- return vand_u8(b, RightShift<32>(b));
+ return vand_u8(b, RightShiftVector<32>(b));
}
// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
@@ -264,7 +264,7 @@ inline uint8x8_t NeedsFilter6(const uint8x8_t abd_p0p1_q0q1,
const uint8_t outer_thresh) {
const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
const uint8x8_t b = vcle_u8(a, vdup_n_u8(inner_thresh));
- const uint8x8_t inner_mask = vand_u8(b, RightShift<32>(b));
+ const uint8x8_t inner_mask = vand_u8(b, RightShiftVector<32>(b));
const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
return vand_u8(inner_mask, outer_mask);
}
@@ -482,7 +482,7 @@ inline uint8x8_t IsFlat4(const uint8x8_t abd_p0n0_q0n0,
const uint8x8_t a = vmax_u8(abd_p0n0_q0n0, abd_p0n1_q0n1);
const uint8x8_t b = vmax_u8(a, abd_p0n2_q0n2);
const uint8x8_t c = vcle_u8(b, vdup_n_u8(1));
- return vand_u8(c, RightShift<32>(c));
+ return vand_u8(c, RightShiftVector<32>(c));
}
// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
@@ -498,7 +498,7 @@ inline uint8x8_t NeedsFilter8(const uint8x8_t abd_p0p1_q0q1,
const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
const uint8x8_t b = vmax_u8(a, abd_p2p3_q2q3);
const uint8x8_t c = vcle_u8(b, vdup_n_u8(inner_thresh));
- const uint8x8_t inner_mask = vand_u8(c, RightShift<32>(c));
+ const uint8x8_t inner_mask = vand_u8(c, RightShiftVector<32>(c));
const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
return vand_u8(inner_mask, outer_mask);
}
@@ -1179,7 +1179,7 @@ void LoopFilterInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc
index 337c9b4..e6ceb66 100644
--- a/src/dsp/arm/loop_restoration_neon.cc
+++ b/src/dsp/arm/loop_restoration_neon.cc
@@ -41,10 +41,25 @@ inline uint8x8_t VshrU128(const uint8x8x2_t src) {
}
template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8_t src[2]) {
+ return vext_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
+inline uint8x16_t VshrU128(const uint8x16_t src[2]) {
+ return vextq_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
inline uint16x8_t VshrU128(const uint16x8x2_t src) {
return vextq_u16(src.val[0], src.val[1], bytes / 2);
}
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8_t src[2]) {
+ return vextq_u16(src[0], src[1], bytes / 2);
+}
+
// Wiener
// Must make a local copy of coefficients to help compiler know that they have
@@ -177,18 +192,17 @@ inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
int16_t** const wiener_buffer) {
for (int y = height; y != 0; --y) {
const uint8_t* src_ptr = src;
- uint8x16_t s[4];
- s[0] = vld1q_u8(src_ptr);
+ uint8x16_t s[3];
ptrdiff_t x = width;
do {
- src_ptr += 16;
- s[3] = vld1q_u8(src_ptr);
- s[1] = vextq_u8(s[0], s[3], 1);
- s[2] = vextq_u8(s[0], s[3], 2);
+ // Slightly faster than using vextq_u8().
+ s[0] = vld1q_u8(src_ptr);
+ s[1] = vld1q_u8(src_ptr + 1);
+ s[2] = vld1q_u8(src_ptr + 2);
int16x8x2_t sum;
sum.val[0] = sum.val[1] = vdupq_n_s16(0);
WienerHorizontalSum(s, filter, sum, *wiener_buffer);
- s[0] = s[3];
+ src_ptr += 16;
*wiener_buffer += 16;
x -= 16;
} while (x != 0);
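The rewritten loop above trades the vextq_u8() shuffles of the old code for three overlapping unaligned loads; a small sketch of the equivalence (helper names are illustrative, not from the source):

    #include <arm_neon.h>

    // Both variants produce s[0] = src[0..15], s[1] = src[1..16], s[2] = src[2..17].
    void LoadShiftedNew(const uint8_t* src, uint8x16_t s[3]) {
      s[0] = vld1q_u8(src);
      s[1] = vld1q_u8(src + 1);
      s[2] = vld1q_u8(src + 2);
    }

    void LoadShiftedOld(const uint8_t* src, uint8x16_t s[3]) {
      const uint8x16_t lo = vld1q_u8(src);
      const uint8x16_t hi = vld1q_u8(src + 16);
      s[0] = lo;
      s[1] = vextq_u8(lo, hi, 1);
      s[2] = vextq_u8(lo, hi, 2);
    }
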
@@ -476,12 +490,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer,
// For width 16 and up, store the horizontal results, and then do the vertical
// filter row by row. This is faster than doing it column by column when
// considering cache issues.
-void WienerFilter_NEON(const RestorationUnitInfo& restoration_info,
- const void* const source, const void* const top_border,
- const void* const bottom_border, const ptrdiff_t stride,
- const int width, const int height,
- RestorationBuffer* const restoration_buffer,
- void* const dest) {
+void WienerFilter_NEON(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
const int16_t* const number_leading_zero_coefficients =
restoration_info.wiener_info.number_leading_zero_coefficients;
const int number_rows_to_skip = std::max(
@@ -509,39 +523,42 @@ void WienerFilter_NEON(const RestorationUnitInfo& restoration_info,
const auto* const top = static_cast<const uint8_t*>(top_border);
const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
- WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
- wiener_stride, height_extra, filter_horizontal,
- &wiener_buffer_horizontal);
- WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, height_extra,
filter_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
filter_horizontal, &wiener_buffer_horizontal);
- } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
- WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
- wiener_stride, height_extra, filter_horizontal,
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal,
&wiener_buffer_horizontal);
- WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, height_extra,
filter_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal,
+ &wiener_buffer_horizontal);
} else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
// The maximum over-reads happen here.
- WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
- wiener_stride, height_extra, filter_horizontal,
- &wiener_buffer_horizontal);
- WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
filter_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal,
+ &wiener_buffer_horizontal);
} else {
assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
- WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
- wiener_stride, height_extra,
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
&wiener_buffer_horizontal);
WienerHorizontalTap1(src, stride, wiener_stride, height,
&wiener_buffer_horizontal);
- WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
- &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
}
// vertical filtering.
@@ -574,13 +591,20 @@ void WienerFilter_NEON(const RestorationUnitInfo& restoration_info,
//------------------------------------------------------------------------------
// SGR
-inline void Prepare3_8(const uint8x8x2_t src, uint8x8_t dst[3]) {
+inline void Prepare3_8(const uint8x8_t src[2], uint8x8_t dst[3]) {
dst[0] = VshrU128<0>(src);
dst[1] = VshrU128<1>(src);
dst[2] = VshrU128<2>(src);
}
-inline void Prepare3_16(const uint16x8x2_t src, uint16x4_t low[3],
+template <int offset>
+inline void Prepare3_8(const uint8x16_t src[2], uint8x16_t dst[3]) {
+ dst[0] = VshrU128<offset + 0>(src);
+ dst[1] = VshrU128<offset + 1>(src);
+ dst[2] = VshrU128<offset + 2>(src);
+}
+
+inline void Prepare3_16(const uint16x8_t src[2], uint16x4_t low[3],
uint16x4_t high[3]) {
uint16x8_t s[3];
s[0] = VshrU128<0>(src);
@@ -594,7 +618,7 @@ inline void Prepare3_16(const uint16x8x2_t src, uint16x4_t low[3],
high[2] = vget_high_u16(s[2]);
}
-inline void Prepare5_8(const uint8x8x2_t src, uint8x8_t dst[5]) {
+inline void Prepare5_8(const uint8x8_t src[2], uint8x8_t dst[5]) {
dst[0] = VshrU128<0>(src);
dst[1] = VshrU128<1>(src);
dst[2] = VshrU128<2>(src);
@@ -602,7 +626,16 @@ inline void Prepare5_8(const uint8x8x2_t src, uint8x8_t dst[5]) {
dst[4] = VshrU128<4>(src);
}
-inline void Prepare5_16(const uint16x8x2_t src, uint16x4_t low[5],
+template <int offset>
+inline void Prepare5_8(const uint8x16_t src[2], uint8x16_t dst[5]) {
+ dst[0] = VshrU128<offset + 0>(src);
+ dst[1] = VshrU128<offset + 1>(src);
+ dst[2] = VshrU128<offset + 2>(src);
+ dst[3] = VshrU128<offset + 3>(src);
+ dst[4] = VshrU128<offset + 4>(src);
+}
+
+inline void Prepare5_16(const uint16x8_t src[2], uint16x4_t low[5],
uint16x4_t high[5]) {
Prepare3_16(src, low, high);
const uint16x8_t s3 = VshrU128<6>(src);
@@ -641,6 +674,30 @@ inline uint16x8_t Sum3W_16(const uint8x8_t src[3]) {
return vaddw_u8(sum, src[2]);
}
+inline uint16x8_t Sum3WLo16(const uint8x16_t src[3]) {
+ const uint16x8_t sum = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1]));
+ return vaddw_u8(sum, vget_low_u8(src[2]));
+}
+
+inline uint16x8_t Sum3WHi16(const uint8x16_t src[3]) {
+ const uint16x8_t sum = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1]));
+ return vaddw_u8(sum, vget_high_u8(src[2]));
+}
+
+inline uint16x8_t Sum5WLo16(const uint8x16_t src[5]) {
+ const uint16x8_t sum01 = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1]));
+ const uint16x8_t sum23 = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[3]));
+ const uint16x8_t sum = vaddq_u16(sum01, sum23);
+ return vaddw_u8(sum, vget_low_u8(src[4]));
+}
+
+inline uint16x8_t Sum5WHi16(const uint8x16_t src[5]) {
+ const uint16x8_t sum01 = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1]));
+ const uint16x8_t sum23 = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[3]));
+ const uint16x8_t sum = vaddq_u16(sum01, sum23);
+ return vaddw_u8(sum, vget_high_u8(src[4]));
+}
+
inline uint32x4_t Sum3W_32(const uint16x4_t src[3]) {
const uint32x4_t sum = vaddl_u16(src[0], src[1]);
return vaddw_u16(sum, src[2]);
@@ -678,13 +735,28 @@ inline uint32x4_t Sum5W_32(const uint16x4_t src[5]) {
return vaddw_u16(sum0123, src[4]);
}
-inline uint16x8_t Sum3Horizontal(const uint8x8x2_t src) {
+inline uint16x8_t Sum3Horizontal(const uint8x8_t src[2]) {
uint8x8_t s[3];
Prepare3_8(src, s);
return Sum3W_16(s);
}
-inline uint32x4x2_t Sum3WHorizontal(const uint16x8x2_t src) {
+inline uint16x8_t Sum3Horizontal(const uint8x16_t src) {
+ uint8x8_t s[2];
+ s[0] = vget_low_u8(src);
+ s[1] = vget_high_u8(src);
+ return Sum3Horizontal(s);
+}
+
+template <int offset>
+inline void Sum3Horizontal(const uint8x16_t src[2], uint16x8_t dst[2]) {
+ uint8x16_t s[3];
+ Prepare3_8<offset>(src, s);
+ dst[0] = Sum3WLo16(s);
+ dst[1] = Sum3WHi16(s);
+}
+
+inline uint32x4x2_t Sum3WHorizontal(const uint16x8_t src[2]) {
uint16x4_t low[3], high[3];
uint32x4x2_t sum;
Prepare3_16(src, low, high);
@@ -693,7 +765,7 @@ inline uint32x4x2_t Sum3WHorizontal(const uint16x8x2_t src) {
return sum;
}
-inline uint16x8_t Sum5Horizontal(const uint8x8x2_t src) {
+inline uint16x8_t Sum5Horizontal(const uint8x8_t src[2]) {
uint8x8_t s[5];
Prepare5_8(src, s);
const uint16x8_t sum01 = vaddl_u8(s[0], s[1]);
@@ -702,7 +774,23 @@ inline uint16x8_t Sum5Horizontal(const uint8x8x2_t src) {
return vaddw_u8(sum0123, s[4]);
}
-inline uint32x4x2_t Sum5WHorizontal(const uint16x8x2_t src) {
+inline uint16x8_t Sum5Horizontal(const uint8x16_t src) {
+ uint8x8_t s[2];
+ s[0] = vget_low_u8(src);
+ s[1] = vget_high_u8(src);
+ return Sum5Horizontal(s);
+}
+
+template <int offset>
+inline void Sum5Horizontal(const uint8x16_t src[2], uint16x8_t* const dst0,
+ uint16x8_t* const dst1) {
+ uint8x16_t s[5];
+ Prepare5_8<offset>(src, s);
+ *dst0 = Sum5WLo16(s);
+ *dst1 = Sum5WHi16(s);
+}
+
+inline uint32x4x2_t Sum5WHorizontal(const uint16x8_t src[2]) {
uint16x4_t low[5], high[5];
Prepare5_16(src, low, high);
uint32x4x2_t sum;
@@ -711,35 +799,68 @@ inline uint32x4x2_t Sum5WHorizontal(const uint16x8x2_t src) {
return sum;
}
-void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3,
- uint32x4_t* const row_sq5) {
- const uint32x4_t sum04 = vaddl_u16(src[0], src[4]);
- const uint32x4_t sum12 = vaddl_u16(src[1], src[2]);
- *row_sq3 = vaddw_u16(sum12, src[3]);
- *row_sq5 = vaddq_u32(sum04, *row_sq3);
+template <int offset>
+void SumHorizontal(const uint8x16_t src[2], uint16x8_t* const row3_0,
+ uint16x8_t* const row3_1, uint16x8_t* const row5_0,
+ uint16x8_t* const row5_1) {
+ uint8x16_t s[5];
+ Prepare5_8<offset>(src, s);
+ const uint16x8_t sum04_lo = vaddl_u8(vget_low_u8(s[0]), vget_low_u8(s[4]));
+ const uint16x8_t sum04_hi = vaddl_u8(vget_high_u8(s[0]), vget_high_u8(s[4]));
+ *row3_0 = Sum3WLo16(s + 1);
+ *row3_1 = Sum3WHi16(s + 1);
+ *row5_0 = vaddq_u16(sum04_lo, *row3_0);
+ *row5_1 = vaddq_u16(sum04_hi, *row3_1);
}
-void SumHorizontal(const uint8x8x2_t src, const uint16x8x2_t sq,
- uint16x8_t* const row3, uint16x8_t* const row5,
- uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+void SumHorizontal(const uint8x8_t src[2], uint16x8_t* const row3,
+ uint16x8_t* const row5) {
uint8x8_t s[5];
Prepare5_8(src, s);
const uint16x8_t sum04 = vaddl_u8(s[0], s[4]);
const uint16x8_t sum12 = vaddl_u8(s[1], s[2]);
*row3 = vaddw_u8(sum12, s[3]);
*row5 = vaddq_u16(sum04, *row3);
+}
+
+void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3,
+ uint32x4_t* const row_sq5) {
+ const uint32x4_t sum04 = vaddl_u16(src[0], src[4]);
+ const uint32x4_t sum12 = vaddl_u16(src[1], src[2]);
+ *row_sq3 = vaddw_u16(sum12, src[3]);
+ *row_sq5 = vaddq_u32(sum04, *row_sq3);
+}
+
+void SumHorizontal(const uint16x8_t sq[2], uint32x4x2_t* const row_sq3,
+ uint32x4x2_t* const row_sq5) {
uint16x4_t low[5], high[5];
Prepare5_16(sq, low, high);
SumHorizontal(low, &row_sq3->val[0], &row_sq5->val[0]);
SumHorizontal(high, &row_sq3->val[1], &row_sq5->val[1]);
}
-inline uint16x8_t Sum343(const uint8x8x2_t src) {
- uint8x8_t s[3];
- Prepare3_8(src, s);
- const uint16x8_t sum = Sum3W_16(s);
+void SumHorizontal(const uint8x8_t src[2], const uint16x8_t sq[2],
+ uint16x8_t* const row3, uint16x8_t* const row5,
+ uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+ SumHorizontal(src, row3, row5);
+ SumHorizontal(sq, row_sq3, row_sq5);
+}
+
+void SumHorizontal(const uint8x16_t src, const uint16x8_t sq[2],
+ uint16x8_t* const row3, uint16x8_t* const row5,
+ uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+ uint8x8_t s[2];
+ s[0] = vget_low_u8(src);
+ s[1] = vget_high_u8(src);
+ return SumHorizontal(s, sq, row3, row5, row_sq3, row_sq5);
+}
+
+template <int offset>
+inline uint16x8_t Sum343(const uint8x16_t ma3[2]) {
+ const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3);
const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
- return vaddw_u8(sum3, s[1]);
+ return vaddw_u8(sum3,
+ (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1]));
}
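Per lane, Sum343() evaluates the 3-4-3 weighted sum as 3 * (a + b + c) + b = 3a + 4b + 3c; a scalar sketch of the arithmetic it vectorizes:

    #include <cstdint>

    uint16_t Sum343Sketch(uint8_t a, uint8_t b, uint8_t c) {
      const uint16_t sum = a + b + c;  // Sum3WLo16 / Sum3WHi16
      return 3 * sum + b;              // Sum3_16(sum, sum, sum), then vaddw_u8
    }
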
inline uint32x4_t Sum343W(const uint16x4_t src[3]) {
@@ -748,7 +869,7 @@ inline uint32x4_t Sum343W(const uint16x4_t src[3]) {
return vaddw_u16(sum3, src[1]);
}
-inline uint32x4x2_t Sum343W(const uint16x8x2_t src) {
+inline uint32x4x2_t Sum343W(const uint16x8_t src[2]) {
uint16x4_t low[3], high[3];
uint32x4x2_t d;
Prepare3_16(src, low, high);
@@ -757,13 +878,13 @@ inline uint32x4x2_t Sum343W(const uint16x8x2_t src) {
return d;
}
-inline uint16x8_t Sum565(const uint8x8x2_t src) {
- uint8x8_t s[3];
- Prepare3_8(src, s);
- const uint16x8_t sum = Sum3W_16(s);
+template <int offset>
+inline uint16x8_t Sum565(const uint8x16_t ma5[2]) {
+ const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma5) : Sum3WHi16(ma5);
const uint16x8_t sum4 = vshlq_n_u16(sum, 2);
const uint16x8_t sum5 = vaddq_u16(sum4, sum);
- return vaddw_u8(sum5, s[1]);
+ return vaddw_u8(sum5,
+ (offset == 0) ? vget_low_u8(ma5[1]) : vget_high_u8(ma5[1]));
}
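Sum565() follows the same pattern with 5-6-5 weights: 5 * (a + b + c) + b = 5a + 6b + 5c, where the multiply by 5 is formed as a shift plus an add:

    #include <cstdint>

    uint16_t Sum565Sketch(uint8_t a, uint8_t b, uint8_t c) {
      const uint16_t sum = a + b + c;
      const uint16_t sum5 = (sum << 2) + sum;  // vshlq_n_u16(sum, 2) + sum
      return sum5 + b;                         // vaddw_u8
    }
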
inline uint32x4_t Sum565W(const uint16x4_t src[3]) {
@@ -773,7 +894,7 @@ inline uint32x4_t Sum565W(const uint16x4_t src[3]) {
return vaddw_u16(sum5, src[1]);
}
-inline uint32x4x2_t Sum565W(const uint16x8x2_t src) {
+inline uint32x4x2_t Sum565W(const uint16x8_t src[2]) {
uint16x4_t low[3], high[3];
uint32x4x2_t d;
Prepare3_16(src, low, high);
@@ -783,21 +904,21 @@ inline uint32x4x2_t Sum565W(const uint16x8x2_t src) {
}
inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
- const int height, const ptrdiff_t sum_stride, uint16_t* sum3,
- uint16_t* sum5, uint32_t* square_sum3,
- uint32_t* square_sum5) {
- int y = height;
+ const ptrdiff_t sum_stride, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ int y = 2;
+ // Don't change loop width to 16, which is even slower.
do {
- uint8x8x2_t s;
- uint16x8x2_t sq;
- s.val[0] = vld1_u8(src);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+ uint8x8_t s[2];
+ uint16x8_t sq[2];
+ s[0] = vld1_u8(src);
+ sq[0] = vmull_u8(s[0], s[0]);
ptrdiff_t x = 0;
do {
uint16x8_t row3, row5;
uint32x4x2_t row_sq3, row_sq5;
- s.val[1] = vld1_u8(src + x + 8);
- sq.val[1] = vmull_u8(s.val[1], s.val[1]);
+ s[1] = vld1_u8(src + x + 8);
+ sq[1] = vmull_u8(s[1], s[1]);
SumHorizontal(s, sq, &row3, &row5, &row_sq3, &row_sq5);
vst1q_u16(sum3, row3);
vst1q_u16(sum5, row5);
@@ -805,8 +926,8 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
vst1q_u32(square_sum3 + 4, row_sq3.val[1]);
vst1q_u32(square_sum5 + 0, row_sq5.val[0]);
vst1q_u32(square_sum5 + 4, row_sq5.val[1]);
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
+ s[0] = s[1];
+ sq[0] = sq[1];
sum3 += 8;
sum5 += 8;
square_sum3 += 8;
@@ -819,21 +940,22 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
template <int size>
inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
- const int height, const ptrdiff_t sum_stride, uint16_t* sums,
+ const ptrdiff_t sum_stride, uint16_t* sums,
uint32_t* square_sums) {
static_assert(size == 3 || size == 5, "");
- int y = height;
+ int y = 2;
+ // Don't change loop width to 16, which is even slower.
do {
- uint8x8x2_t s;
- uint16x8x2_t sq;
- s.val[0] = vld1_u8(src);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+ uint8x8_t s[2];
+ uint16x8_t sq[2];
+ s[0] = vld1_u8(src);
+ sq[0] = vmull_u8(s[0], s[0]);
ptrdiff_t x = 0;
do {
uint16x8_t row;
uint32x4x2_t row_sq;
- s.val[1] = vld1_u8(src + x + 8);
- sq.val[1] = vmull_u8(s.val[1], s.val[1]);
+ s[1] = vld1_u8(src + x + 8);
+ sq[1] = vmull_u8(s[1], s[1]);
if (size == 3) {
row = Sum3Horizontal(s);
row_sq = Sum3WHorizontal(sq);
@@ -844,8 +966,8 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
vst1q_u16(sums, row);
vst1q_u32(square_sums + 0, row_sq.val[0]);
vst1q_u32(square_sums + 4, row_sq.val[1]);
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
+ s[0] = s[1];
+ sq[0] = sq[1];
sums += 8;
square_sums += 8;
x += 8;
@@ -871,10 +993,18 @@ inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq,
return vmovn_u32(shifted);
}
-template <int n>
+inline uint8x8_t AdjustValue(const uint8x8_t value, const uint8x8_t index,
+ const int threshold) {
+ const uint8x8_t thresholds = vdup_n_u8(threshold);
+ const uint8x8_t offset = vcgt_u8(index, thresholds);
+ // Adding 255 is equivalent to subtracting 1 for 8-bit data.
+ return vadd_u8(value, offset);
+}
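AdjustValue() relies on 8-bit wraparound: the vcgt_u8 mask is 0 or 255 per lane, and adding 255 modulo 256 equals subtracting 1, so the add conditionally decrements the value. A scalar sketch:

    #include <cstdint>

    uint8_t AdjustValueSketch(uint8_t value, uint8_t index, uint8_t threshold) {
      const uint8_t mask = (index > threshold) ? 0xFF : 0x00;  // vcgt_u8
      return static_cast<uint8_t>(value + mask);               // vadd_u8 wraps mod 256
    }
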
+
+template <int n, int offset>
inline void CalculateIntermediate(const uint16x8_t sum,
const uint32x4x2_t sum_sq,
- const uint32_t scale, uint8x8_t* const ma,
+ const uint32_t scale, uint8x16_t* const ma,
uint16x8_t* const b) {
constexpr uint32_t one_over_n =
((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
@@ -882,19 +1012,39 @@ inline void CalculateIntermediate(const uint16x8_t sum,
const uint16x4_t z1 =
CalculateMa<n>(vget_high_u16(sum), sum_sq.val[1], scale);
const uint16x8_t z01 = vcombine_u16(z0, z1);
- // Using vqmovn_u16() needs an extra sign extension instruction.
- const uint16x8_t z = vminq_u16(z01, vdupq_n_u16(255));
- // Using vgetq_lane_s16() can save the sign extension instruction.
- const uint8_t lookup[8] = {
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 0)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 1)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 2)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 3)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 4)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 5)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 6)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 7)]};
- *ma = vld1_u8(lookup);
+ const uint8x8_t idx = vqmovn_u16(z01);
+ // Use table lookup to read elements whose indices are less than 48.
+ // Using one uint8x8x4_t vector and one uint8x8x2_t vector is faster than
+ // using two uint8x8x3_t vectors.
+ uint8x8x4_t table0;
+ uint8x8x2_t table1;
+ table0.val[0] = vld1_u8(kSgrMaLookup + 0 * 8);
+ table0.val[1] = vld1_u8(kSgrMaLookup + 1 * 8);
+ table0.val[2] = vld1_u8(kSgrMaLookup + 2 * 8);
+ table0.val[3] = vld1_u8(kSgrMaLookup + 3 * 8);
+ table1.val[0] = vld1_u8(kSgrMaLookup + 4 * 8);
+ table1.val[1] = vld1_u8(kSgrMaLookup + 5 * 8);
+ // All elements whose indices are out of range [0, 47] are set to 0.
+ uint8x8_t val = vtbl4_u8(table0, idx); // Range [0, 31].
+ // Subtract 32 to look up the next index range.
+ const uint8x8_t index = vsub_u8(idx, vdup_n_u8(32));
+ const uint8x8_t res = vtbl2_u8(table1, index); // Range [32, 47].
+ // Use an OR instruction to combine the two lookup results.
+ val = vorr_u8(val, res);
+
+ // For elements whose indices are larger than 47, the values change only
+ // rarely as the index increases, so comparison and arithmetic operations
+ // are used to calculate their values.
+ // Elements whose indices are larger than 47 (looked up as 0 above) are set
+ // to 5.
+ val = vmax_u8(val, vdup_n_u8(5));
+ val = AdjustValue(val, idx, 55); // 55 is the last index whose value is 5.
+ val = AdjustValue(val, idx, 72); // 72 is the last index whose value is 4.
+ val = AdjustValue(val, idx, 101); // 101 is the last index whose value is 3.
+ val = AdjustValue(val, idx, 169); // 169 is the last index whose value is 2.
+ val = AdjustValue(val, idx, 254); // 254 is the last index whose value is 1.
+ *ma = (offset == 0) ? vcombine_u8(val, vget_high_u8(*ma))
+ : vcombine_u8(vget_low_u8(*ma), val);
+
// b = ma * b * one_over_n
// |ma| = [0, 255]
// |sum| is a box sum with radius 1 or 2.
@@ -906,7 +1056,8 @@ inline void CalculateIntermediate(const uint16x8_t sum,
// |kSgrProjReciprocalBits| is 12.
// Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
// Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
- const uint16x8_t maq = vmovl_u8(*ma);
+ const uint16x8_t maq =
+ vmovl_u8((offset == 0) ? vget_low_u8(*ma) : vget_high_u8(*ma));
const uint32x4_t m0 = vmull_u16(vget_low_u16(maq), vget_low_u16(sum));
const uint32x4_t m1 = vmull_u16(vget_high_u16(maq), vget_high_u16(sum));
const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n);
@@ -916,37 +1067,39 @@ inline void CalculateIntermediate(const uint16x8_t sum,
*b = vcombine_u16(b_lo, b_hi);
}
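Taken together, the vmax_u8() and the five AdjustValue() steps above reconstruct the kSgrMaLookup entries for indices above 47 without a larger table: the value starts at 5 and drops by one past each listed threshold. A scalar sketch, assuming the thresholds in the comments are exhaustive:

    #include <cstdint>

    uint8_t SgrMaAbove47Sketch(uint8_t idx) {
      // Only meaningful for idx > 47, where the table lookup returned 0.
      uint8_t val = 5;  // vmax_u8(val, vdup_n_u8(5))
      const uint8_t thresholds[] = {55, 72, 101, 169, 254};
      for (const uint8_t t : thresholds) {
        if (idx > t) --val;  // AdjustValue(val, idx, t)
      }
      return val;
    }
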
+template <int offset>
inline void CalculateIntermediate5(const uint16x8_t s5[5],
const uint32x4x2_t sq5[5],
- const uint32_t scale, uint8x8_t* const ma,
+ const uint32_t scale, uint8x16_t* const ma,
uint16x8_t* const b) {
const uint16x8_t sum = Sum5_16(s5);
const uint32x4x2_t sum_sq = Sum5_32(sq5);
- CalculateIntermediate<25>(sum, sum_sq, scale, ma, b);
+ CalculateIntermediate<25, offset>(sum, sum_sq, scale, ma, b);
}
+template <int offset>
inline void CalculateIntermediate3(const uint16x8_t s3[3],
const uint32x4x2_t sq3[3],
- const uint32_t scale, uint8x8_t* const ma,
+ const uint32_t scale, uint8x16_t* const ma,
uint16x8_t* const b) {
const uint16x8_t sum = Sum3_16(s3);
const uint32x4x2_t sum_sq = Sum3_32(sq3);
- CalculateIntermediate<9>(sum, sum_sq, scale, ma, b);
+ CalculateIntermediate<9, offset>(sum, sum_sq, scale, ma, b);
}
-inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
const ptrdiff_t x, uint16x8_t* const sum_ma343,
uint16x8_t* const sum_ma444,
uint32x4x2_t* const sum_b343,
uint32x4x2_t* const sum_b444, uint16_t* const ma343,
uint16_t* const ma444, uint32_t* const b343,
uint32_t* const b444) {
- uint8x8_t s[3];
- Prepare3_8(ma3, s);
- const uint16x8_t sum_ma111 = Sum3W_16(s);
+ const uint16x8_t sum_ma111 = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3);
*sum_ma444 = vshlq_n_u16(sum_ma111, 2);
const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111);
- *sum_ma343 = vaddw_u8(sum333, s[1]);
+ *sum_ma343 = vaddw_u8(
+ sum333, (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1]));
uint16x4_t low[3], high[3];
uint32x4x2_t sum_b111;
Prepare3_16(b3, low, high);
@@ -966,93 +1119,211 @@ inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
vst1q_u32(b444 + x + 4, sum_b444->val[1]);
}
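Store343_444() derives both weighted sums from a single 1-1-1 sum: the 4-4-4 sum is 4 * (a + b + c), and the 3-4-3 sum is that minus (a + b + c) plus the middle element. A scalar sketch of the per-lane arithmetic:

    #include <cstdint>

    void Store343_444Sketch(uint8_t a, uint8_t b, uint8_t c, uint16_t* sum343,
                            uint16_t* sum444) {
      const uint16_t sum111 = a + b + c;
      *sum444 = sum111 << 2;                     // vshlq_n_u16(sum_ma111, 2)
      const uint16_t sum333 = *sum444 - sum111;
      *sum343 = sum333 + b;                      // vaddw_u8(sum333, middle)
    }
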
-inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
const ptrdiff_t x, uint16x8_t* const sum_ma343,
uint32x4x2_t* const sum_b343, uint16_t* const ma343,
uint16_t* const ma444, uint32_t* const b343,
uint32_t* const b444) {
uint16x8_t sum_ma444;
uint32x4x2_t sum_b444;
- Store343_444(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444, ma343,
- ma444, b343, b444);
+ Store343_444<offset>(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444,
+ ma343, ma444, b343, b444);
}
-inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
const ptrdiff_t x, uint16_t* const ma343,
uint16_t* const ma444, uint32_t* const b343,
uint32_t* const b444) {
uint16x8_t sum_ma343;
uint32x4x2_t sum_b343;
- Store343_444(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343, b444);
+ Store343_444<offset>(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343,
+ b444);
}
-LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
- const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
- const uint32_t scale, uint16_t* const sum5[5],
- uint32_t* const square_sum5[5], uint8x8x2_t s[2], uint16x8x2_t sq[2],
- uint8x8_t* const ma, uint16x8_t* const b) {
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const uint8_t* const src0, const uint8_t* const src1, const uint32_t scale,
+ uint8x16_t s[2][2], uint16_t* const sum5[5], uint32_t* const square_sum5[5],
+ uint16x8_t sq[2][4], uint8x16_t* const ma, uint16x8_t* const b) {
uint16x8_t s5[5];
uint32x4x2_t sq5[5];
- s[0].val[1] = vld1_u8(src0 + x + 8);
- s[1].val[1] = vld1_u8(src1 + x + 8);
- sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]);
- sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]);
- s5[3] = Sum5Horizontal(s[0]);
- s5[4] = Sum5Horizontal(s[1]);
+ s[0][0] = vld1q_u8(src0);
+ s[1][0] = vld1q_u8(src1);
+ sq[0][0] = vmull_u8(vget_low_u8(s[0][0]), vget_low_u8(s[0][0]));
+ sq[1][0] = vmull_u8(vget_low_u8(s[1][0]), vget_low_u8(s[1][0]));
+ sq[0][1] = vmull_u8(vget_high_u8(s[0][0]), vget_high_u8(s[0][0]));
+ sq[1][1] = vmull_u8(vget_high_u8(s[1][0]), vget_high_u8(s[1][0]));
+ s5[3] = Sum5Horizontal(s[0][0]);
+ s5[4] = Sum5Horizontal(s[1][0]);
sq5[3] = Sum5WHorizontal(sq[0]);
sq5[4] = Sum5WHorizontal(sq[1]);
- vst1q_u16(sum5[3] + x, s5[3]);
- vst1q_u16(sum5[4] + x, s5[4]);
+ vst1q_u16(sum5[3], s5[3]);
+ vst1q_u16(sum5[4], s5[4]);
+ vst1q_u32(square_sum5[3] + 0, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + 4, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + 0, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + 4, sq5[4].val[1]);
+ s5[0] = vld1q_u16(sum5[0]);
+ s5[1] = vld1q_u16(sum5[1]);
+ s5[2] = vld1q_u16(sum5[2]);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
+ const uint32_t scale, uint8x16_t s[2][2], uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma[2],
+ uint16x8_t b[2]) {
+ uint16x8_t s5[2][5];
+ uint32x4x2_t sq5[5];
+ s[0][1] = vld1q_u8(src0 + x + 8);
+ s[1][1] = vld1q_u8(src1 + x + 8);
+ sq[0][2] = vmull_u8(vget_low_u8(s[0][1]), vget_low_u8(s[0][1]));
+ sq[1][2] = vmull_u8(vget_low_u8(s[1][1]), vget_low_u8(s[1][1]));
+ Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
+ Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
+ sq5[3] = Sum5WHorizontal(sq[0] + 1);
+ sq5[4] = Sum5WHorizontal(sq[1] + 1);
+ vst1q_u16(sum5[3] + x, s5[0][3]);
+ vst1q_u16(sum5[4] + x, s5[0][4]);
vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
- s5[0] = vld1q_u16(sum5[0] + x);
- s5[1] = vld1q_u16(sum5[1] + x);
- s5[2] = vld1q_u16(sum5[2] + x);
+ s5[0][0] = vld1q_u16(sum5[0] + x);
+ s5[0][1] = vld1q_u16(sum5[1] + x);
+ s5[0][2] = vld1q_u16(sum5[2] + x);
sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
- CalculateIntermediate5(s5, sq5, scale, ma, b);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]);
+
+ sq[0][3] = vmull_u8(vget_high_u8(s[0][1]), vget_high_u8(s[0][1]));
+ sq[1][3] = vmull_u8(vget_high_u8(s[1][1]), vget_high_u8(s[1][1]));
+ sq5[3] = Sum5WHorizontal(sq[0] + 2);
+ sq5[4] = Sum5WHorizontal(sq[1] + 2);
+ vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+ vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+ vst1q_u32(square_sum5[3] + x + 8, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + x + 12, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + x + 8, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + x + 12, sq5[4].val[1]);
+ s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+ s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+ s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const uint8_t* const src, const uint32_t scale, uint8x16_t* const s,
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ uint16x8_t sq[2], uint8x16_t* const ma, uint16x8_t* const b) {
+ uint16x8_t s5[5];
+ uint32x4x2_t sq5[5];
+ *s = vld1q_u8(src);
+ sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s));
+ sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s));
+ s5[3] = s5[4] = Sum5Horizontal(*s);
+ sq5[3] = sq5[4] = Sum5WHorizontal(sq);
+ s5[0] = vld1q_u16(sum5[0]);
+ s5[1] = vld1q_u16(sum5[1]);
+ s5[2] = vld1q_u16(sum5[2]);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
- const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
- uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma,
- uint16x8_t* const b) {
- uint16x8_t s5[5];
+ uint8x16_t s[2], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], uint16x8_t sq[3], uint8x16_t ma[2],
+ uint16x8_t b[2]) {
+ uint16x8_t s5[2][5];
uint32x4x2_t sq5[5];
- s->val[1] = vld1_u8(src + x + 8);
- sq->val[1] = vmull_u8(s->val[1], s->val[1]);
- s5[3] = s5[4] = Sum5Horizontal(*s);
- sq5[3] = sq5[4] = Sum5WHorizontal(*sq);
- s5[0] = vld1q_u16(sum5[0] + x);
- s5[1] = vld1q_u16(sum5[1] + x);
- s5[2] = vld1q_u16(sum5[2] + x);
+ s[1] = vld1q_u8(src + x + 8);
+ sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+ Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]);
+ sq5[3] = sq5[4] = Sum5WHorizontal(sq);
+ s5[0][0] = vld1q_u16(sum5[0] + x);
+ s5[0][1] = vld1q_u16(sum5[1] + x);
+ s5[0][2] = vld1q_u16(sum5[2] + x);
+ s5[0][4] = s5[0][3];
sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
- CalculateIntermediate5(s5, sq5, scale, ma, b);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]);
+
+ sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+ sq5[3] = sq5[4] = Sum5WHorizontal(sq + 1);
+ s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+ s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+ s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+ s5[1][4] = s5[1][3];
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const uint8_t* const src, const uint32_t scale, uint8x16_t* const s,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16x8_t sq[2],
+ uint8x16_t* const ma, uint16x8_t* const b) {
+ uint16x8_t s3[3];
+ uint32x4x2_t sq3[3];
+ *s = vld1q_u8(src);
+ sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s));
+ sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s));
+ s3[2] = Sum3Horizontal(*s);
+ sq3[2] = Sum3WHorizontal(sq);
+ vst1q_u16(sum3[2], s3[2]);
+ vst1q_u32(square_sum3[2] + 0, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + 4, sq3[2].val[1]);
+ s3[0] = vld1q_u16(sum3[0]);
+ s3[1] = vld1q_u16(sum3[1]);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4);
+ CalculateIntermediate3<0>(s3, sq3, scale, ma, b);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
- uint16_t* const sum3[3], uint32_t* const square_sum3[3],
- uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma,
- uint16x8_t* const b) {
- uint16x8_t s3[3];
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint8x16_t s[2],
+ uint16x8_t sq[3], uint8x16_t ma[2], uint16x8_t b[2]) {
+ uint16x8_t s3[4];
uint32x4x2_t sq3[3];
- s->val[1] = vld1_u8(src + x + 8);
- sq->val[1] = vmull_u8(s->val[1], s->val[1]);
- s3[2] = Sum3Horizontal(*s);
- sq3[2] = Sum3WHorizontal(*sq);
+ s[1] = vld1q_u8(src + x + 8);
+ sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+ Sum3Horizontal<8>(s, s3 + 2);
+ sq3[2] = Sum3WHorizontal(sq);
vst1q_u16(sum3[2] + x, s3[2]);
vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
@@ -1062,71 +1333,204 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
- CalculateIntermediate3(s3, sq3, scale, ma, b);
+ CalculateIntermediate3<8>(s3, sq3, scale, &ma[0], &b[0]);
+
+ sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+ sq3[2] = Sum3WHorizontal(sq + 1);
+ vst1q_u16(sum3[2] + x + 8, s3[3]);
+ vst1q_u32(square_sum3[2] + x + 8, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + x + 12, sq3[2].val[1]);
+ s3[1] = vld1q_u16(sum3[0] + x + 8);
+ s3[2] = vld1q_u16(sum3[1] + x + 8);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12);
+ CalculateIntermediate3<0>(s3 + 1, sq3, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const uint8_t* const src0, const uint8_t* const src1,
+ const uint16_t scales[2], uint8x16_t s[2][2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2],
+ uint16x8_t b3[2][3], uint8x16_t* const ma5, uint16x8_t* const b5) {
+ uint16x8_t s3[4], s5[5];
+ uint32x4x2_t sq3[4], sq5[5];
+ s[0][0] = vld1q_u8(src0);
+ s[1][0] = vld1q_u8(src1);
+ sq[0][0] = vmull_u8(vget_low_u8(s[0][0]), vget_low_u8(s[0][0]));
+ sq[1][0] = vmull_u8(vget_low_u8(s[1][0]), vget_low_u8(s[1][0]));
+ sq[0][1] = vmull_u8(vget_high_u8(s[0][0]), vget_high_u8(s[0][0]));
+ sq[1][1] = vmull_u8(vget_high_u8(s[1][0]), vget_high_u8(s[1][0]));
+ SumHorizontal(s[0][0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]);
+ SumHorizontal(s[1][0], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]);
+ vst1q_u16(sum3[2], s3[2]);
+ vst1q_u16(sum3[3], s3[3]);
+ vst1q_u32(square_sum3[2] + 0, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + 4, sq3[2].val[1]);
+ vst1q_u32(square_sum3[3] + 0, sq3[3].val[0]);
+ vst1q_u32(square_sum3[3] + 4, sq3[3].val[1]);
+ vst1q_u16(sum5[3], s5[3]);
+ vst1q_u16(sum5[4], s5[4]);
+ vst1q_u32(square_sum5[3] + 0, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + 4, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + 0, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + 4, sq5[4].val[1]);
+ s3[0] = vld1q_u16(sum3[0]);
+ s3[1] = vld1q_u16(sum3[1]);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4);
+ s5[0] = vld1q_u16(sum5[0]);
+ s5[1] = vld1q_u16(sum5[1]);
+ s5[2] = vld1q_u16(sum5[2]);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ CalculateIntermediate3<0>(s3, sq3, scales[1], ma3[0], b3[0]);
+ CalculateIntermediate3<0>(s3 + 1, sq3 + 1, scales[1], ma3[1], b3[1]);
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
- const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
- uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
- uint8x8x2_t s[2], uint16x8x2_t sq[2], uint8x8_t* const ma3_0,
- uint8x8_t* const ma3_1, uint16x8_t* const b3_0, uint16x8_t* const b3_1,
- uint8x8_t* const ma5, uint16x8_t* const b5) {
- uint16x8_t s3[4], s5[5];
+ const uint16_t scales[2], uint8x16_t s[2][2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2],
+ uint16x8_t b3[2][3], uint8x16_t ma5[2], uint16x8_t b5[2]) {
+ uint16x8_t s3[2][4], s5[2][5];
uint32x4x2_t sq3[4], sq5[5];
- s[0].val[1] = vld1_u8(src0 + x + 8);
- s[1].val[1] = vld1_u8(src1 + x + 8);
- sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]);
- sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]);
- SumHorizontal(s[0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]);
- SumHorizontal(s[1], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]);
- vst1q_u16(sum3[2] + x, s3[2]);
- vst1q_u16(sum3[3] + x, s3[3]);
+ s[0][1] = vld1q_u8(src0 + x + 8);
+ s[1][1] = vld1q_u8(src1 + x + 8);
+ sq[0][2] = vmull_u8(vget_low_u8(s[0][1]), vget_low_u8(s[0][1]));
+ sq[1][2] = vmull_u8(vget_low_u8(s[1][1]), vget_low_u8(s[1][1]));
+ SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+ SumHorizontal(sq[0] + 1, &sq3[2], &sq5[3]);
+ SumHorizontal(sq[1] + 1, &sq3[3], &sq5[4]);
+ vst1q_u16(sum3[2] + x, s3[0][2]);
+ vst1q_u16(sum3[3] + x, s3[0][3]);
vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
vst1q_u32(square_sum3[3] + x + 0, sq3[3].val[0]);
vst1q_u32(square_sum3[3] + x + 4, sq3[3].val[1]);
- vst1q_u16(sum5[3] + x, s5[3]);
- vst1q_u16(sum5[4] + x, s5[4]);
+ vst1q_u16(sum5[3] + x, s5[0][3]);
+ vst1q_u16(sum5[4] + x, s5[0][4]);
vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
- s3[0] = vld1q_u16(sum3[0] + x);
- s3[1] = vld1q_u16(sum3[1] + x);
+ s3[0][0] = vld1q_u16(sum3[0] + x);
+ s3[0][1] = vld1q_u16(sum3[1] + x);
sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
- s5[0] = vld1q_u16(sum5[0] + x);
- s5[1] = vld1q_u16(sum5[1] + x);
- s5[2] = vld1q_u16(sum5[2] + x);
+ s5[0][0] = vld1q_u16(sum5[0] + x);
+ s5[0][1] = vld1q_u16(sum5[1] + x);
+ s5[0][2] = vld1q_u16(sum5[2] + x);
sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
- CalculateIntermediate3(s3, sq3, scales[1], ma3_0, b3_0);
- CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], ma3_1, b3_1);
- CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+ CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0][0], &b3[0][1]);
+ CalculateIntermediate3<8>(s3[0] + 1, sq3 + 1, scales[1], &ma3[1][0],
+ &b3[1][1]);
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]);
+
+ sq[0][3] = vmull_u8(vget_high_u8(s[0][1]), vget_high_u8(s[0][1]));
+ sq[1][3] = vmull_u8(vget_high_u8(s[1][1]), vget_high_u8(s[1][1]));
+ SumHorizontal(sq[0] + 2, &sq3[2], &sq5[3]);
+ SumHorizontal(sq[1] + 2, &sq3[3], &sq5[4]);
+ vst1q_u16(sum3[2] + x + 8, s3[1][2]);
+ vst1q_u16(sum3[3] + x + 8, s3[1][3]);
+ vst1q_u32(square_sum3[2] + x + 8, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + x + 12, sq3[2].val[1]);
+ vst1q_u32(square_sum3[3] + x + 8, sq3[3].val[0]);
+ vst1q_u32(square_sum3[3] + x + 12, sq3[3].val[1]);
+ vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+ vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+ vst1q_u32(square_sum5[3] + x + 8, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + x + 12, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + x + 8, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + x + 12, sq5[4].val[1]);
+ s3[1][0] = vld1q_u16(sum3[0] + x + 8);
+ s3[1][1] = vld1q_u16(sum3[1] + x + 8);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12);
+ s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+ s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+ s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[0][1], &b3[0][2]);
+ CalculateIntermediate3<0>(s3[1] + 1, sq3 + 1, scales[1], &ma3[1][1],
+ &b3[1][2]);
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const uint8_t* const src, const uint16_t scales[2],
+ const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+ uint8x16_t* const s, uint16x8_t sq[2], uint8x16_t* const ma3,
+ uint8x16_t* const ma5, uint16x8_t* const b3, uint16x8_t* const b5) {
+ uint16x8_t s3[3], s5[5];
+ uint32x4x2_t sq3[3], sq5[5];
+ *s = vld1q_u8(src);
+ sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s));
+ sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s));
+ SumHorizontal(*s, sq, &s3[2], &s5[3], &sq3[2], &sq5[3]);
+ s5[0] = vld1q_u16(sum5[0]);
+ s5[1] = vld1q_u16(sum5[1]);
+ s5[2] = vld1q_u16(sum5[2]);
+ s5[4] = s5[3];
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ sq5[4] = sq5[3];
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+ s3[0] = vld1q_u16(sum3[0]);
+ s3[1] = vld1q_u16(sum3[1]);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4);
+ CalculateIntermediate3<0>(s3, sq3, scales[1], ma3, b3);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
const uint8_t* const src, const ptrdiff_t x, const uint16_t scales[2],
const uint16_t* const sum3[4], const uint16_t* const sum5[5],
const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
- uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma3,
- uint8x8_t* const ma5, uint16x8_t* const b3, uint16x8_t* const b5) {
- uint16x8_t s3[3], s5[5];
+ uint8x16_t s[2], uint16x8_t sq[3], uint8x16_t ma3[2], uint8x16_t ma5[2],
+ uint16x8_t b3[2], uint16x8_t b5[2]) {
+ uint16x8_t s3[2][3], s5[2][5];
uint32x4x2_t sq3[3], sq5[5];
- s->val[1] = vld1_u8(src + x + 8);
- sq->val[1] = vmull_u8(s->val[1], s->val[1]);
- SumHorizontal(*s, *sq, &s3[2], &s5[3], &sq3[2], &sq5[3]);
- s5[0] = vld1q_u16(sum5[0] + x);
- s5[1] = vld1q_u16(sum5[1] + x);
- s5[2] = vld1q_u16(sum5[2] + x);
- s5[4] = s5[3];
+ s[1] = vld1q_u8(src + x + 8);
+ sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+ SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal(sq, &sq3[2], &sq5[3]);
+ s5[0][0] = vld1q_u16(sum5[0] + x);
+ s5[0][1] = vld1q_u16(sum5[1] + x);
+ s5[0][2] = vld1q_u16(sum5[2] + x);
+ s5[0][4] = s5[0][3];
sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
@@ -1134,14 +1538,36 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
sq5[4] = sq5[3];
- CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
- s3[0] = vld1q_u16(sum3[0] + x);
- s3[1] = vld1q_u16(sum3[1] + x);
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]);
+ s3[0][0] = vld1q_u16(sum3[0] + x);
+ s3[0][1] = vld1q_u16(sum3[1] + x);
sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
- CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+ CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0], &b3[0]);
+
+ sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+ SumHorizontal(sq + 1, &sq3[2], &sq5[3]);
+ s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+ s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+ s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+ s5[1][4] = s5[1][3];
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ sq5[4] = sq5[3];
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]);
+ s3[1][0] = vld1q_u16(sum3[0] + x + 8);
+ s3[1][1] = vld1q_u16(sum3[1] + x + 8);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12);
+ CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[1], &b3[1]);
}
inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
@@ -1150,33 +1576,39 @@ inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
uint16_t* const sum5[5],
uint32_t* const square_sum5[5],
uint16_t* ma565, uint32_t* b565) {
- uint8x8x2_t s[2], mas;
- uint16x8x2_t sq[2], bs;
- s[0].val[0] = vld1_u8(src0);
- s[1].val[0] = vld1_u8(src1);
- sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
- sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
- BoxFilterPreProcess5(src0, src1, 0, scale, sum5, square_sum5, s, sq,
- &mas.val[0], &bs.val[0]);
+ uint8x16_t s[2][2], mas[2];
+ uint16x8_t sq[2][4], bs[3];
+ BoxFilterPreProcess5Lo(src0, src1, scale, s, sum5, square_sum5, sq, &mas[0],
+ &bs[0]);
int x = 0;
do {
- s[0].val[0] = s[0].val[1];
- s[1].val[0] = s[1].val[1];
- sq[0].val[0] = sq[0].val[1];
- sq[1].val[0] = sq[1].val[1];
- BoxFilterPreProcess5(src0, src1, x + 8, scale, sum5, square_sum5, s, sq,
- &mas.val[1], &bs.val[1]);
- const uint16x8_t ma = Sum565(mas);
- const uint32x4x2_t b = Sum565W(bs);
- vst1q_u16(ma565, ma);
- vst1q_u32(b565 + 0, b.val[0]);
- vst1q_u32(b565 + 4, b.val[1]);
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
- ma565 += 8;
- b565 += 8;
- x += 8;
+ uint16x8_t ma[2];
+ uint8x16_t masx[3];
+ uint32x4x2_t b[2];
+ BoxFilterPreProcess5(src0, src1, x + 8, scale, s, sum5, square_sum5, sq,
+ mas, bs + 1);
+ Prepare3_8<0>(mas, masx);
+ ma[0] = Sum565<0>(masx);
+ b[0] = Sum565W(bs);
+ vst1q_u16(ma565, ma[0]);
+ vst1q_u32(b565 + 0, b[0].val[0]);
+ vst1q_u32(b565 + 4, b[0].val[1]);
+
+ ma[1] = Sum565<8>(masx);
+ b[1] = Sum565W(bs + 1);
+ vst1q_u16(ma565 + 8, ma[1]);
+ vst1q_u32(b565 + 8, b[1].val[0]);
+ vst1q_u32(b565 + 12, b[1].val[1]);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
} while (x < width);
}
@@ -1185,35 +1617,44 @@ LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
const uint8_t* const src, const int width, const uint32_t scale,
uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16_t* ma343,
uint16_t* ma444, uint32_t* b343, uint32_t* b444) {
- uint8x8x2_t s, mas;
- uint16x8x2_t sq, bs;
- s.val[0] = vld1_u8(src);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
- BoxFilterPreProcess3(src, 0, scale, sum3, square_sum3, &s, &sq, &mas.val[0],
- &bs.val[0]);
+ uint8x16_t s[2], mas[2];
+ uint16x8_t sq[4], bs[3];
+ BoxFilterPreProcess3Lo(src, scale, &s[0], sum3, square_sum3, sq, &mas[0],
+ &bs[0]);
int x = 0;
do {
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
- BoxFilterPreProcess3(src, x + 8, scale, sum3, square_sum3, &s, &sq,
- &mas.val[1], &bs.val[1]);
+ uint8x16_t ma3x[3];
+ BoxFilterPreProcess3(src, x + 8, scale, sum3, square_sum3, s, sq + 1, mas,
+ bs + 1);
+ Prepare3_8<0>(mas, ma3x);
if (calculate444) {
- Store343_444(mas, bs, 0, ma343, ma444, b343, b444);
- ma444 += 8;
- b444 += 8;
+ Store343_444<0>(ma3x, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444<8>(ma3x, bs + 1, 0, ma343 + 8, ma444 + 8, b343 + 8,
+ b444 + 8);
+ ma444 += 16;
+ b444 += 16;
} else {
- const uint16x8_t ma = Sum343(mas);
- const uint32x4x2_t b = Sum343W(bs);
- vst1q_u16(ma343, ma);
- vst1q_u32(b343 + 0, b.val[0]);
- vst1q_u32(b343 + 4, b.val[1]);
+ uint16x8_t ma[2];
+ uint32x4x2_t b[2];
+ ma[0] = Sum343<0>(ma3x);
+ b[0] = Sum343W(bs);
+ vst1q_u16(ma343, ma[0]);
+ vst1q_u32(b343 + 0, b[0].val[0]);
+ vst1q_u32(b343 + 4, b[0].val[1]);
+ ma[1] = Sum343<8>(ma3x);
+ b[1] = Sum343W(bs + 1);
+ vst1q_u16(ma343 + 8, ma[1]);
+ vst1q_u32(b343 + 8, b[1].val[0]);
+ vst1q_u32(b343 + 12, b[1].val[1]);
}
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
- ma343 += 8;
- b343 += 8;
- x += 8;
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma343 += 16;
+ b343 += 16;
+ x += 16;
} while (x < width);
}
@@ -1221,48 +1662,58 @@ inline void BoxSumFilterPreProcess(
const uint8_t* const src0, const uint8_t* const src1, const int width,
const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
- uint16_t* const ma343[4], uint16_t* const ma444[2], uint16_t* ma565,
- uint32_t* const b343[4], uint32_t* const b444[2], uint32_t* b565) {
- uint8x8x2_t s[2];
- uint8x8x2_t ma3[2], ma5;
- uint16x8x2_t sq[2], b3[2], b5;
- s[0].val[0] = vld1_u8(src0);
- s[1].val[0] = vld1_u8(src1);
- sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
- sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
- BoxFilterPreProcess(src0, src1, 0, scales, sum3, sum5, square_sum3,
- square_sum5, s, sq, &ma3[0].val[0], &ma3[1].val[0],
- &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]);
+ uint16_t* const ma343[4], uint16_t* const ma444, uint16_t* ma565,
+ uint32_t* const b343[4], uint32_t* const b444, uint32_t* b565) {
+ uint8x16_t s[2][2], ma3[2][2], ma5[2];
+ uint16x8_t sq[2][4], b3[2][3], b5[3];
+ BoxFilterPreProcessLo(src0, src1, scales, s, sum3, sum5, square_sum3,
+ square_sum5, sq, ma3, b3, &ma5[0], &b5[0]);
int x = 0;
do {
- s[0].val[0] = s[0].val[1];
- s[1].val[0] = s[1].val[1];
- sq[0].val[0] = sq[0].val[1];
- sq[1].val[0] = sq[1].val[1];
- BoxFilterPreProcess(src0, src1, x + 8, scales, sum3, sum5, square_sum3,
- square_sum5, s, sq, &ma3[0].val[1], &ma3[1].val[1],
- &b3[0].val[1], &b3[1].val[1], &ma5.val[1], &b5.val[1]);
- uint16x8_t ma = Sum343(ma3[0]);
- uint32x4x2_t b = Sum343W(b3[0]);
- vst1q_u16(ma343[0] + x, ma);
- vst1q_u32(b343[0] + x, b.val[0]);
- vst1q_u32(b343[0] + x + 4, b.val[1]);
- Store343_444(ma3[1], b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
- ma = Sum565(ma5);
- b = Sum565W(b5);
- vst1q_u16(ma565, ma);
- vst1q_u32(b565 + 0, b.val[0]);
- vst1q_u32(b565 + 4, b.val[1]);
- ma3[0].val[0] = ma3[0].val[1];
- ma3[1].val[0] = ma3[1].val[1];
- b3[0].val[0] = b3[0].val[1];
- b3[1].val[0] = b3[1].val[1];
- ma5.val[0] = ma5.val[1];
- b5.val[0] = b5.val[1];
- ma565 += 8;
- b565 += 8;
- x += 8;
+ uint16x8_t ma[2];
+ uint8x16_t ma3x[3], ma5x[3];
+ uint32x4x2_t b[2];
+ BoxFilterPreProcess(src0, src1, x + 8, scales, s, sum3, sum5, square_sum3,
+ square_sum5, sq, ma3, b3, ma5, b5 + 1);
+ Prepare3_8<0>(ma3[0], ma3x);
+ ma[0] = Sum343<0>(ma3x);
+ ma[1] = Sum343<8>(ma3x);
+ b[0] = Sum343W(b3[0] + 0);
+ b[1] = Sum343W(b3[0] + 1);
+ vst1q_u16(ma343[0] + x, ma[0]);
+ vst1q_u16(ma343[0] + x + 8, ma[1]);
+ vst1q_u32(b343[0] + x, b[0].val[0]);
+ vst1q_u32(b343[0] + x + 4, b[0].val[1]);
+ vst1q_u32(b343[0] + x + 8, b[1].val[0]);
+ vst1q_u32(b343[0] + x + 12, b[1].val[1]);
+ Prepare3_8<0>(ma3[1], ma3x);
+ Store343_444<0>(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444<8>(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0] = Sum565<0>(ma5x);
+ ma[1] = Sum565<8>(ma5x);
+ b[0] = Sum565W(b5);
+ b[1] = Sum565W(b5 + 1);
+ vst1q_u16(ma565, ma[0]);
+ vst1q_u16(ma565 + 8, ma[1]);
+ vst1q_u32(b565 + 0, b[0].val[0]);
+ vst1q_u32(b565 + 4, b[0].val[1]);
+ vst1q_u32(b565 + 8, b[1].val[0]);
+ vst1q_u32(b565 + 12, b[1].val[1]);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ ma5[0] = ma5[1];
+ b5[0] = b5[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
} while (x < width);
}
@@ -1310,37 +1761,36 @@ inline int16x8_t CalculateFilteredOutputPass2(const uint8x8_t s,
return CalculateFilteredOutput<5>(s, ma_sum, b_sum);
}
-inline void SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2],
- uint8_t* const dst) {
+inline uint8x8_t SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2]) {
const int16x4_t v_lo =
vrshrn_n_s32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
const int16x4_t v_hi =
vrshrn_n_s32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
const int16x8_t vv = vcombine_s16(v_lo, v_hi);
- const int16x8_t s = ZeroExtend(src);
- const int16x8_t d = vaddq_s16(s, vv);
- vst1_u8(dst, vqmovun_s16(d));
+ const int16x8_t d =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vv), src));
+ return vqmovun_s16(d);
}
-inline void SelfGuidedDoubleMultiplier(const uint8x8_t src,
- const int16x8_t filter[2], const int w0,
- const int w2, uint8_t* const dst) {
+inline uint8x8_t SelfGuidedDoubleMultiplier(const uint8x8_t src,
+ const int16x8_t filter[2],
+ const int w0, const int w2) {
int32x4_t v[2];
v[0] = vmull_n_s16(vget_low_s16(filter[0]), w0);
v[1] = vmull_n_s16(vget_high_s16(filter[0]), w0);
v[0] = vmlal_n_s16(v[0], vget_low_s16(filter[1]), w2);
v[1] = vmlal_n_s16(v[1], vget_high_s16(filter[1]), w2);
- SelfGuidedFinal(src, v, dst);
+ return SelfGuidedFinal(src, v);
}
-inline void SelfGuidedSingleMultiplier(const uint8x8_t src,
- const int16x8_t filter, const int w0,
- uint8_t* const dst) {
+inline uint8x8_t SelfGuidedSingleMultiplier(const uint8x8_t src,
+ const int16x8_t filter,
+ const int w0) {
// weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
int32x4_t v[2];
v[0] = vmull_n_s16(vget_low_s16(filter), w0);
v[1] = vmull_n_s16(vget_high_s16(filter), w0);
- SelfGuidedFinal(src, v, dst);
+ return SelfGuidedFinal(src, v);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
@@ -1349,43 +1799,60 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
uint32_t* const square_sum5[5], const int width, const uint32_t scale,
const int16_t w0, uint16_t* const ma565[2], uint32_t* const b565[2],
uint8_t* const dst) {
- uint8x8x2_t s[2], mas;
- uint16x8x2_t sq[2], bs;
- s[0].val[0] = vld1_u8(src0);
- s[1].val[0] = vld1_u8(src1);
- sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
- sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
- BoxFilterPreProcess5(src0, src1, 0, scale, sum5, square_sum5, s, sq,
- &mas.val[0], &bs.val[0]);
+ uint8x16_t s[2][2], mas[2];
+ uint16x8_t sq[2][4], bs[3];
+ BoxFilterPreProcess5Lo(src0, src1, scale, s, sum5, square_sum5, sq, &mas[0],
+ &bs[0]);
int x = 0;
do {
- s[0].val[0] = s[0].val[1];
- s[1].val[0] = s[1].val[1];
- sq[0].val[0] = sq[0].val[1];
- sq[1].val[0] = sq[1].val[1];
- BoxFilterPreProcess5(src0, src1, x + 8, scale, sum5, square_sum5, s, sq,
- &mas.val[1], &bs.val[1]);
uint16x8_t ma[2];
+ uint8x16_t masx[3];
uint32x4x2_t b[2];
- ma[1] = Sum565(mas);
+ int16x8_t p0, p1;
+ BoxFilterPreProcess5(src0, src1, x + 8, scale, s, sum5, square_sum5, sq,
+ mas, bs + 1);
+ Prepare3_8<0>(mas, masx);
+ ma[1] = Sum565<0>(masx);
b[1] = Sum565W(bs);
vst1q_u16(ma565[1] + x, ma[1]);
vst1q_u32(b565[1] + x + 0, b[1].val[0]);
vst1q_u32(b565[1] + x + 4, b[1].val[1]);
- const uint8x8_t sr0 = vld1_u8(src + x);
- const uint8x8_t sr1 = vld1_u8(src + stride + x);
- int16x8_t p0, p1;
+ const uint8x16_t sr0 = vld1q_u8(src + x);
+ const uint8x16_t sr1 = vld1q_u8(src + stride + x);
+ const uint8x8_t sr00 = vget_low_u8(sr0);
+ const uint8x8_t sr10 = vget_low_u8(sr1);
ma[0] = vld1q_u16(ma565[0] + x);
b[0].val[0] = vld1q_u32(b565[0] + x + 0);
b[0].val[1] = vld1q_u32(b565[0] + x + 4);
- p0 = CalculateFilteredOutputPass1(sr0, ma, b);
- p1 = CalculateFilteredOutput<4>(sr1, ma[1], b[1]);
- SelfGuidedSingleMultiplier(sr0, p0, w0, dst + x);
- SelfGuidedSingleMultiplier(sr1, p1, w0, dst + stride + x);
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
- x += 8;
+ p0 = CalculateFilteredOutputPass1(sr00, ma, b);
+ p1 = CalculateFilteredOutput<4>(sr10, ma[1], b[1]);
+ const uint8x8_t d00 = SelfGuidedSingleMultiplier(sr00, p0, w0);
+ const uint8x8_t d10 = SelfGuidedSingleMultiplier(sr10, p1, w0);
+
+ ma[1] = Sum565<8>(masx);
+ b[1] = Sum565W(bs + 1);
+ vst1q_u16(ma565[1] + x + 8, ma[1]);
+ vst1q_u32(b565[1] + x + 8, b[1].val[0]);
+ vst1q_u32(b565[1] + x + 12, b[1].val[1]);
+ const uint8x8_t sr01 = vget_high_u8(sr0);
+ const uint8x8_t sr11 = vget_high_u8(sr1);
+ ma[0] = vld1q_u16(ma565[0] + x + 8);
+ b[0].val[0] = vld1q_u32(b565[0] + x + 8);
+ b[0].val[1] = vld1q_u32(b565[0] + x + 12);
+ p0 = CalculateFilteredOutputPass1(sr01, ma, b);
+ p1 = CalculateFilteredOutput<4>(sr11, ma[1], b[1]);
+ const uint8x8_t d01 = SelfGuidedSingleMultiplier(sr01, p0, w0);
+ const uint8x8_t d11 = SelfGuidedSingleMultiplier(sr11, p1, w0);
+ vst1q_u8(dst + x, vcombine_u8(d00, d01));
+ vst1q_u8(dst + stride + x, vcombine_u8(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
} while (x < width);
}
@@ -1396,34 +1863,45 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src,
uint32_t* const square_sum5[5],
uint16_t* ma565, uint32_t* b565,
uint8_t* const dst) {
- uint8x8x2_t s, mas;
- uint16x8x2_t sq, bs;
- s.val[0] = vld1_u8(src0);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
- BoxFilterPreProcess5LastRow(src0, 0, scale, sum5, square_sum5, &s, &sq,
- &mas.val[0], &bs.val[0]);
+ uint8x16_t s[2], mas[2];
+ uint16x8_t sq[4], bs[4];
+ BoxFilterPreProcess5LastRowLo(src0, scale, s, sum5, square_sum5, sq, &mas[0],
+ &bs[0]);
int x = 0;
do {
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
- BoxFilterPreProcess5LastRow(src0, x + 8, scale, sum5, square_sum5, &s, &sq,
- &mas.val[1], &bs.val[1]);
uint16x8_t ma[2];
+ uint8x16_t masx[3];
uint32x4x2_t b[2];
- ma[1] = Sum565(mas);
+ BoxFilterPreProcess5LastRow(src0, x + 8, scale, s, sum5, square_sum5,
+ sq + 1, mas, bs + 1);
+ Prepare3_8<0>(mas, masx);
+ ma[1] = Sum565<0>(masx);
b[1] = Sum565W(bs);
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
ma[0] = vld1q_u16(ma565);
b[0].val[0] = vld1q_u32(b565 + 0);
b[0].val[1] = vld1q_u32(b565 + 4);
- const uint8x8_t sr = vld1_u8(src + x);
- const int16x8_t p = CalculateFilteredOutputPass1(sr, ma, b);
- SelfGuidedSingleMultiplier(sr, p, w0, dst + x);
- ma565 += 8;
- b565 += 8;
- x += 8;
+ const uint8x16_t sr = vld1q_u8(src + x);
+ const uint8x8_t sr0 = vget_low_u8(sr);
+ const int16x8_t p0 = CalculateFilteredOutputPass1(sr0, ma, b);
+ const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0);
+
+ ma[1] = Sum565<8>(masx);
+ b[1] = Sum565W(bs + 1);
+ bs[0] = bs[2];
+ const uint8x8_t sr1 = vget_high_u8(sr);
+ ma[0] = vld1q_u16(ma565 + 8);
+ b[0].val[0] = vld1q_u32(b565 + 8);
+ b[0].val[1] = vld1q_u32(b565 + 12);
+ const int16x8_t p1 = CalculateFilteredOutputPass1(sr1, ma, b);
+ const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0);
+ vst1q_u8(dst + x, vcombine_u8(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
} while (x < width);
}
@@ -1433,35 +1911,49 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
uint32_t* const square_sum3[3], uint16_t* const ma343[3],
uint16_t* const ma444[2], uint32_t* const b343[3], uint32_t* const b444[2],
uint8_t* const dst) {
- uint8x8x2_t s, mas;
- uint16x8x2_t sq, bs;
- s.val[0] = vld1_u8(src0);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
- BoxFilterPreProcess3(src0, 0, scale, sum3, square_sum3, &s, &sq, &mas.val[0],
- &bs.val[0]);
+ uint8x16_t s[2], mas[2];
+ uint16x8_t sq[4], bs[3];
+ BoxFilterPreProcess3Lo(src0, scale, &s[0], sum3, square_sum3, sq, &mas[0],
+ &bs[0]);
int x = 0;
do {
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
- BoxFilterPreProcess3(src0, x + 8, scale, sum3, square_sum3, &s, &sq,
- &mas.val[1], &bs.val[1]);
uint16x8_t ma[3];
+ uint8x16_t ma3x[3];
uint32x4x2_t b[3];
- Store343_444(mas, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2],
- b444[1]);
- const uint8x8_t sr = vld1_u8(src + x);
+ BoxFilterPreProcess3(src0, x + 8, scale, sum3, square_sum3, s, sq + 1, mas,
+ bs + 1);
+ Prepare3_8<0>(mas, ma3x);
+ Store343_444<0>(ma3x, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2],
+ b444[1]);
+ const uint8x16_t sr = vld1q_u8(src + x);
+ const uint8x8_t sr0 = vget_low_u8(sr);
ma[0] = vld1q_u16(ma343[0] + x);
ma[1] = vld1q_u16(ma444[0] + x);
b[0].val[0] = vld1q_u32(b343[0] + x + 0);
b[0].val[1] = vld1q_u32(b343[0] + x + 4);
b[1].val[0] = vld1q_u32(b444[0] + x + 0);
b[1].val[1] = vld1q_u32(b444[0] + x + 4);
- const int16x8_t p = CalculateFilteredOutputPass2(sr, ma, b);
- SelfGuidedSingleMultiplier(sr, p, w0, dst + x);
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
- x += 8;
+ const int16x8_t p0 = CalculateFilteredOutputPass2(sr0, ma, b);
+ const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0);
+
+ Store343_444<8>(ma3x, bs + 1, x + 8, &ma[2], &b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const uint8x8_t sr1 = vget_high_u8(sr);
+ ma[0] = vld1q_u16(ma343[0] + x + 8);
+ ma[1] = vld1q_u16(ma444[0] + x + 8);
+ b[0].val[0] = vld1q_u32(b343[0] + x + 8);
+ b[0].val[1] = vld1q_u32(b343[0] + x + 12);
+ b[1].val[0] = vld1q_u32(b444[0] + x + 8);
+ b[1].val[1] = vld1q_u32(b444[0] + x + 12);
+ const int16x8_t p1 = CalculateFilteredOutputPass2(sr1, ma, b);
+ const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0);
+ vst1q_u8(dst + x, vcombine_u8(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
} while (x < width);
}
@@ -1474,64 +1966,96 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter(
uint16_t* const ma343[4], uint16_t* const ma444[3],
uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
uint32_t* const b565[2], uint8_t* const dst) {
- uint8x8x2_t s[2], ma3[2], ma5;
- uint16x8x2_t sq[2], b3[2], b5;
- s[0].val[0] = vld1_u8(src0);
- s[1].val[0] = vld1_u8(src1);
- sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
- sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
- BoxFilterPreProcess(src0, src1, 0, scales, sum3, sum5, square_sum3,
- square_sum5, s, sq, &ma3[0].val[0], &ma3[1].val[0],
- &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]);
+ uint8x16_t s[2][2], ma3[2][2], ma5[2];
+ uint16x8_t sq[2][4], b3[2][3], b5[3];
+ BoxFilterPreProcessLo(src0, src1, scales, s, sum3, sum5, square_sum3,
+ square_sum5, sq, ma3, b3, &ma5[0], &b5[0]);
int x = 0;
do {
- s[0].val[0] = s[0].val[1];
- s[1].val[0] = s[1].val[1];
- sq[0].val[0] = sq[0].val[1];
- sq[1].val[0] = sq[1].val[1];
- BoxFilterPreProcess(src0, src1, x + 8, scales, sum3, sum5, square_sum3,
- square_sum5, s, sq, &ma3[0].val[1], &ma3[1].val[1],
- &b3[0].val[1], &b3[1].val[1], &ma5.val[1], &b5.val[1]);
uint16x8_t ma[3][3];
+ uint8x16_t ma3x[2][3], ma5x[3];
uint32x4x2_t b[3][3];
- Store343_444(ma3[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1],
- ma343[2], ma444[1], b343[2], b444[1]);
- Store343_444(ma3[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2],
- b343[3], b444[2]);
- ma[0][1] = Sum565(ma5);
+ int16x8_t p[2][2];
+ BoxFilterPreProcess(src0, src1, x + 8, scales, s, sum3, sum5, square_sum3,
+ square_sum5, sq, ma3, b3, ma5, b5 + 1);
+ Prepare3_8<0>(ma3[0], ma3x[0]);
+ Prepare3_8<0>(ma3[1], ma3x[1]);
+ Store343_444<0>(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444<0>(ma3x[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0][1] = Sum565<0>(ma5x);
b[0][1] = Sum565W(b5);
vst1q_u16(ma565[1] + x, ma[0][1]);
vst1q_u32(b565[1] + x, b[0][1].val[0]);
vst1q_u32(b565[1] + x + 4, b[0][1].val[1]);
- ma3[0].val[0] = ma3[0].val[1];
- ma3[1].val[0] = ma3[1].val[1];
- b3[0].val[0] = b3[0].val[1];
- b3[1].val[0] = b3[1].val[1];
- ma5.val[0] = ma5.val[1];
- b5.val[0] = b5.val[1];
- int16x8_t p[2][2];
- const uint8x8_t sr0 = vld1_u8(src + x);
- const uint8x8_t sr1 = vld1_u8(src + stride + x);
+ const uint8x16_t sr0 = vld1q_u8(src + x);
+ const uint8x16_t sr1 = vld1q_u8(src + stride + x);
+ const uint8x8_t sr00 = vget_low_u8(sr0);
+ const uint8x8_t sr10 = vget_low_u8(sr1);
ma[0][0] = vld1q_u16(ma565[0] + x);
b[0][0].val[0] = vld1q_u32(b565[0] + x);
b[0][0].val[1] = vld1q_u32(b565[0] + x + 4);
- p[0][0] = CalculateFilteredOutputPass1(sr0, ma[0], b[0]);
- p[1][0] = CalculateFilteredOutput<4>(sr1, ma[0][1], b[0][1]);
+ p[0][0] = CalculateFilteredOutputPass1(sr00, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr10, ma[0][1], b[0][1]);
ma[1][0] = vld1q_u16(ma343[0] + x);
ma[1][1] = vld1q_u16(ma444[0] + x);
b[1][0].val[0] = vld1q_u32(b343[0] + x);
b[1][0].val[1] = vld1q_u32(b343[0] + x + 4);
b[1][1].val[0] = vld1q_u32(b444[0] + x);
b[1][1].val[1] = vld1q_u32(b444[0] + x + 4);
- p[0][1] = CalculateFilteredOutputPass2(sr0, ma[1], b[1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr00, ma[1], b[1]);
ma[2][0] = vld1q_u16(ma343[1] + x);
b[2][0].val[0] = vld1q_u32(b343[1] + x);
b[2][0].val[1] = vld1q_u32(b343[1] + x + 4);
- p[1][1] = CalculateFilteredOutputPass2(sr1, ma[2], b[2]);
- SelfGuidedDoubleMultiplier(sr0, p[0], w0, w2, dst + x);
- SelfGuidedDoubleMultiplier(sr1, p[1], w0, w2, dst + stride + x);
- x += 8;
+ p[1][1] = CalculateFilteredOutputPass2(sr10, ma[2], b[2]);
+ const uint8x8_t d00 = SelfGuidedDoubleMultiplier(sr00, p[0], w0, w2);
+ const uint8x8_t d10 = SelfGuidedDoubleMultiplier(sr10, p[1], w0, w2);
+
+ Store343_444<8>(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], &b[1][2],
+ &b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444<8>(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], &b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ ma[0][1] = Sum565<8>(ma5x);
+ b[0][1] = Sum565W(b5 + 1);
+ vst1q_u16(ma565[1] + x + 8, ma[0][1]);
+ vst1q_u32(b565[1] + x + 8, b[0][1].val[0]);
+ vst1q_u32(b565[1] + x + 12, b[0][1].val[1]);
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ const uint8x8_t sr01 = vget_high_u8(sr0);
+ const uint8x8_t sr11 = vget_high_u8(sr1);
+ ma[0][0] = vld1q_u16(ma565[0] + x + 8);
+ b[0][0].val[0] = vld1q_u32(b565[0] + x + 8);
+ b[0][0].val[1] = vld1q_u32(b565[0] + x + 12);
+ p[0][0] = CalculateFilteredOutputPass1(sr01, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr11, ma[0][1], b[0][1]);
+ ma[1][0] = vld1q_u16(ma343[0] + x + 8);
+ ma[1][1] = vld1q_u16(ma444[0] + x + 8);
+ b[1][0].val[0] = vld1q_u32(b343[0] + x + 8);
+ b[1][0].val[1] = vld1q_u32(b343[0] + x + 12);
+ b[1][1].val[0] = vld1q_u32(b444[0] + x + 8);
+ b[1][1].val[1] = vld1q_u32(b444[0] + x + 12);
+ p[0][1] = CalculateFilteredOutputPass2(sr01, ma[1], b[1]);
+ ma[2][0] = vld1q_u16(ma343[1] + x + 8);
+ b[2][0].val[0] = vld1q_u32(b343[1] + x + 8);
+ b[2][0].val[1] = vld1q_u32(b343[1] + x + 12);
+ p[1][1] = CalculateFilteredOutputPass2(sr11, ma[2], b[2]);
+ const uint8x8_t d01 = SelfGuidedDoubleMultiplier(sr01, p[0], w0, w2);
+ const uint8x8_t d11 = SelfGuidedDoubleMultiplier(sr11, p[1], w0, w2);
+ vst1q_u8(dst + x, vcombine_u8(d00, d01));
+ vst1q_u8(dst + stride + x, vcombine_u8(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ x += 16;
} while (x < width);
}
@@ -1540,58 +2064,79 @@ inline void BoxFilterLastRow(
const uint16_t scales[2], const int16_t w0, const int16_t w2,
uint16_t* const sum3[4], uint16_t* const sum5[5],
uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
- uint16_t* const ma343[4], uint16_t* const ma444[3],
- uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
- uint32_t* const b565[2], uint8_t* const dst) {
- uint8x8x2_t s, ma3, ma5;
- uint16x8x2_t sq, b3, b5;
- uint16x8_t ma[3];
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint8_t* const dst) {
+ uint8x16_t s[2], ma3[2], ma5[2];
+ uint16x8_t sq[4], ma[3], b3[3], b5[3];
uint32x4x2_t b[3];
- s.val[0] = vld1_u8(src0);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
- BoxFilterPreProcessLastRow(src0, 0, scales, sum3, sum5, square_sum3,
- square_sum5, &s, &sq, &ma3.val[0], &ma5.val[0],
- &b3.val[0], &b5.val[0]);
+ BoxFilterPreProcessLastRowLo(src0, scales, sum3, sum5, square_sum3,
+ square_sum5, &s[0], sq, &ma3[0], &ma5[0], &b3[0],
+ &b5[0]);
int x = 0;
do {
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
+ uint8x16_t ma3x[3], ma5x[3];
+ int16x8_t p[2];
BoxFilterPreProcessLastRow(src0, x + 8, scales, sum3, sum5, square_sum3,
- square_sum5, &s, &sq, &ma3.val[1], &ma5.val[1],
- &b3.val[1], &b5.val[1]);
- ma[1] = Sum565(ma5);
+ square_sum5, s, sq + 1, ma3, ma5, &b3[1],
+ &b5[1]);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[1] = Sum565<0>(ma5x);
b[1] = Sum565W(b5);
- ma5.val[0] = ma5.val[1];
- b5.val[0] = b5.val[1];
- ma[2] = Sum343(ma3);
+ Prepare3_8<0>(ma3, ma3x);
+ ma[2] = Sum343<0>(ma3x);
b[2] = Sum343W(b3);
- ma3.val[0] = ma3.val[1];
- b3.val[0] = b3.val[1];
- const uint8x8_t sr = vld1_u8(src + x);
- int16x8_t p[2];
- ma[0] = vld1q_u16(ma565[0] + x);
- b[0].val[0] = vld1q_u32(b565[0] + x + 0);
- b[0].val[1] = vld1q_u32(b565[0] + x + 4);
- p[0] = CalculateFilteredOutputPass1(sr, ma, b);
- ma[0] = vld1q_u16(ma343[0] + x);
- ma[1] = vld1q_u16(ma444[0] + x);
- b[0].val[0] = vld1q_u32(b343[0] + x + 0);
- b[0].val[1] = vld1q_u32(b343[0] + x + 4);
- b[1].val[0] = vld1q_u32(b444[0] + x + 0);
- b[1].val[1] = vld1q_u32(b444[0] + x + 4);
- p[1] = CalculateFilteredOutputPass2(sr, ma, b);
- SelfGuidedDoubleMultiplier(sr, p, w0, w2, dst + x);
- x += 8;
+ const uint8x16_t sr = vld1q_u8(src + x);
+ const uint8x8_t sr0 = vget_low_u8(sr);
+ ma[0] = vld1q_u16(ma565 + x);
+ b[0].val[0] = vld1q_u32(b565 + x + 0);
+ b[0].val[1] = vld1q_u32(b565 + x + 4);
+ p[0] = CalculateFilteredOutputPass1(sr0, ma, b);
+ ma[0] = vld1q_u16(ma343 + x);
+ ma[1] = vld1q_u16(ma444 + x);
+ b[0].val[0] = vld1q_u32(b343 + x + 0);
+ b[0].val[1] = vld1q_u32(b343 + x + 4);
+ b[1].val[0] = vld1q_u32(b444 + x + 0);
+ b[1].val[1] = vld1q_u32(b444 + x + 4);
+ p[1] = CalculateFilteredOutputPass2(sr0, ma, b);
+ const uint8x8_t d0 = SelfGuidedDoubleMultiplier(sr0, p, w0, w2);
+
+ ma[1] = Sum565<8>(ma5x);
+ b[1] = Sum565W(b5 + 1);
+ b5[0] = b5[2];
+ ma[2] = Sum343<8>(ma3x);
+ b[2] = Sum343W(b3 + 1);
+ b3[0] = b3[2];
+ const uint8x8_t sr1 = vget_high_u8(sr);
+ ma[0] = vld1q_u16(ma565 + x + 8);
+ b[0].val[0] = vld1q_u32(b565 + x + 8);
+ b[0].val[1] = vld1q_u32(b565 + x + 12);
+ p[0] = CalculateFilteredOutputPass1(sr1, ma, b);
+ ma[0] = vld1q_u16(ma343 + x + 8);
+ ma[1] = vld1q_u16(ma444 + x + 8);
+ b[0].val[0] = vld1q_u32(b343 + x + 8);
+ b[0].val[1] = vld1q_u32(b343 + x + 12);
+ b[1].val[0] = vld1q_u32(b444 + x + 8);
+ b[1].val[1] = vld1q_u32(b444 + x + 12);
+ p[1] = CalculateFilteredOutputPass2(sr1, ma, b);
+ const uint8x8_t d1 = SelfGuidedDoubleMultiplier(sr1, p, w0, w2);
+ vst1q_u8(dst + x, vcombine_u8(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ ma3[0] = ma3[1];
+ ma5[0] = ma5[1];
+ x += 16;
} while (x < width);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
const RestorationUnitInfo& restoration_info, const uint8_t* src,
- const uint8_t* const top_border, const uint8_t* bottom_border,
- const ptrdiff_t stride, const int width, const int height,
+ const ptrdiff_t stride, const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
SgrBuffer* const sgr_buffer, uint8_t* dst) {
- const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
const ptrdiff_t sum_stride = temp_stride + 8;
const int sgr_proj_index = restoration_info.sgr_proj_info.index;
const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
@@ -1628,13 +2173,13 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
b565[1] = b565[0] + temp_stride;
assert(scales[0] != 0);
assert(scales[1] != 0);
- BoxSum(top_border, stride, 2, sum_stride, sum3[0], sum5[1], square_sum3[0],
- square_sum5[1]);
+ BoxSum(top_border, top_border_stride, sum_stride, sum3[0], sum5[1],
+ square_sum3[0], square_sum5[1]);
sum5[0] = sum5[1];
square_sum5[0] = square_sum5[1];
const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
- square_sum5, ma343, ma444, ma565[0], b343, b444,
+ square_sum5, ma343, ma444[0], ma565[0], b343, b444[0],
b565[0]);
sum5[0] = sgr_buffer->sum5;
square_sum5[0] = sgr_buffer->square_sum5;
@@ -1665,7 +2210,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
const uint8_t* sr[2];
if ((height & 1) == 0) {
sr[0] = bottom_border;
- sr[1] = bottom_border + stride;
+ sr[1] = bottom_border + bottom_border_stride;
} else {
sr[0] = src + 2 * stride;
sr[1] = bottom_border;
@@ -1689,20 +2234,22 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
std::swap(ma565[0], ma565[1]);
std::swap(b565[0], b565[1]);
}
- BoxFilterLastRow(src + 3, bottom_border + stride, width, scales, w0, w2,
- sum3, sum5, square_sum3, square_sum5, ma343, ma444, ma565,
- b343, b444, b565, dst);
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5,
+ ma343[0], ma444[0], ma565[0], b343[0], b444[0], b565[0],
+ dst);
}
}
inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
- const uint8_t* src,
+ const uint8_t* src, const ptrdiff_t stride,
const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
const uint8_t* bottom_border,
- const ptrdiff_t stride, const int width,
- const int height, SgrBuffer* const sgr_buffer,
- uint8_t* dst) {
- const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
const ptrdiff_t sum_stride = temp_stride + 8;
const int sgr_proj_index = restoration_info.sgr_proj_info.index;
const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
@@ -1720,7 +2267,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
b565[0] = sgr_buffer->b565;
b565[1] = b565[0] + temp_stride;
assert(scale != 0);
- BoxSum<5>(top_border, stride, 2, sum_stride, sum5[1], square_sum5[1]);
+ BoxSum<5>(top_border, top_border_stride, sum_stride, sum5[1], square_sum5[1]);
sum5[0] = sum5[1];
square_sum5[0] = square_sum5[1];
const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
@@ -1746,7 +2293,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
const uint8_t* sr[2];
if ((height & 1) == 0) {
sr[0] = bottom_border;
- sr[1] = bottom_border + stride;
+ sr[1] = bottom_border + bottom_border_stride;
} else {
sr[0] = src + 2 * stride;
sr[1] = bottom_border;
@@ -1763,20 +2310,21 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
Circulate5PointersBy2<uint16_t>(sum5);
Circulate5PointersBy2<uint32_t>(square_sum5);
}
- BoxFilterPass1LastRow(src + 3, bottom_border + stride, width, scale, w0,
- sum5, square_sum5, ma565[0], b565[0], dst);
+ BoxFilterPass1LastRow(src + 3, bottom_border + bottom_border_stride, width,
+ scale, w0, sum5, square_sum5, ma565[0], b565[0], dst);
}
}
inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
- const uint8_t* src,
+ const uint8_t* src, const ptrdiff_t stride,
const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
const uint8_t* bottom_border,
- const ptrdiff_t stride, const int width,
- const int height, SgrBuffer* const sgr_buffer,
- uint8_t* dst) {
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
- const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
const ptrdiff_t sum_stride = temp_stride + 8;
const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
@@ -1799,7 +2347,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
b444[0] = sgr_buffer->b444;
b444[1] = b444[0] + temp_stride;
assert(scale != 0);
- BoxSum<3>(top_border, stride, 2, sum_stride, sum3[0], square_sum3[0]);
+ BoxSum<3>(top_border, top_border_stride, sum_stride, sum3[0], square_sum3[0]);
BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, ma343[0],
nullptr, b343[0], nullptr);
Circulate3PointersBy1<uint16_t>(sum3);
@@ -1809,7 +2357,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
s = src + stride;
} else {
s = bottom_border;
- bottom_border += stride;
+ bottom_border += bottom_border_stride;
}
BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, ma343[1],
ma444[0], b343[1], b444[0]);
@@ -1836,7 +2384,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
ma343, ma444, b343, b444, dst);
src += stride;
dst += stride;
- bottom_border += stride;
+ bottom_border += bottom_border_stride;
Circulate3PointersBy1<uint16_t>(ma343);
Circulate3PointersBy1<uint32_t>(b343);
std::swap(ma444[0], ma444[1]);
@@ -1849,8 +2397,9 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
// part of the visible frame.
void SelfGuidedFilter_NEON(
const RestorationUnitInfo& restoration_info, const void* const source,
- const void* const top_border, const void* const bottom_border,
- const ptrdiff_t stride, const int width, const int height,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
RestorationBuffer* const restoration_buffer, void* const dest) {
const int index = restoration_info.sgr_proj_info.index;
const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
@@ -1864,14 +2413,17 @@ void SelfGuidedFilter_NEON(
// |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
// following assertion.
assert(radius_pass_0 != 0);
- BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3,
- stride, width, height, sgr_buffer, dst);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
} else if (radius_pass_0 == 0) {
- BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2,
- stride, width, height, sgr_buffer, dst);
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
} else {
- BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride,
- width, height, sgr_buffer, dst);
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
}
}
@@ -1890,7 +2442,7 @@ void LoopRestorationInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
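The refactored SelfGuidedFinal and SelfGuidedSingleMultiplier above now return a uint8x8_t so that two 8-pixel halves can be combined with vcombine_u8 and written with a single vst1q_u8 store. As a rough guide to what each lane computes, here is a minimal scalar sketch, not part of the patch, assuming kSgrProjRestoreBits == 4 and kSgrProjPrecisionBits == 7 as defined elsewhere in libgav1; the name SelfGuidedSingleScalar is purely illustrative.

#include <algorithm>
#include <cstdint>

// Per-pixel equivalent of SelfGuidedSingleMultiplier + SelfGuidedFinal:
// scale the filter correction by w0, round-shift it back to pixel range,
// add it to the source pixel and clamp to [0, 255].
inline uint8_t SelfGuidedSingleScalar(uint8_t src, int16_t filter, int w0) {
  constexpr int kShift = 4 + 7;  // kSgrProjRestoreBits + kSgrProjPrecisionBits
  const int32_t v = filter * w0;                                // vmull_n_s16
  const int32_t rounded = (v + (1 << (kShift - 1))) >> kShift;  // vrshrn_n_s32
  const int32_t d = src + rounded;                              // vaddw_u8
  return static_cast<uint8_t>(std::min(std::max(d, 0), 255));   // vqmovun_s16
}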
diff --git a/src/dsp/arm/mask_blend_neon.cc b/src/dsp/arm/mask_blend_neon.cc
index 084f42f..ee50923 100644
--- a/src/dsp/arm/mask_blend_neon.cc
+++ b/src/dsp/arm/mask_blend_neon.cc
@@ -432,7 +432,7 @@ void MaskBlendInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/motion_field_projection_neon.cc b/src/dsp/arm/motion_field_projection_neon.cc
index 8caba7d..3e731b2 100644
--- a/src/dsp/arm/motion_field_projection_neon.cc
+++ b/src/dsp/arm/motion_field_projection_neon.cc
@@ -382,7 +382,7 @@ void MotionFieldProjectionInit_NEON() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/motion_vector_search_neon.cc b/src/dsp/arm/motion_vector_search_neon.cc
index 8a403a6..da3ba17 100644
--- a/src/dsp/arm/motion_vector_search_neon.cc
+++ b/src/dsp/arm/motion_vector_search_neon.cc
@@ -256,7 +256,7 @@ void MotionVectorSearchInit_NEON() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/obmc_neon.cc b/src/dsp/arm/obmc_neon.cc
index 66ad663..1111a90 100644
--- a/src/dsp/arm/obmc_neon.cc
+++ b/src/dsp/arm/obmc_neon.cc
@@ -380,7 +380,7 @@ void ObmcInit_NEON() { Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/super_res_neon.cc b/src/dsp/arm/super_res_neon.cc
index 1680450..91537c4 100644
--- a/src/dsp/arm/super_res_neon.cc
+++ b/src/dsp/arm/super_res_neon.cc
@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/arm/common_neon.h"
#include "src/dsp/super_res.h"
#include "src/utils/cpu.h"
@@ -20,6 +19,7 @@
#include <arm_neon.h>
+#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
@@ -82,10 +82,10 @@ inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps],
}
void SuperRes_NEON(const void* const coefficients, void* const source,
- const ptrdiff_t stride, const int height,
+ const ptrdiff_t source_stride, const int height,
const int downscaled_width, const int upscaled_width,
const int initial_subpixel_x, const int step,
- void* const dest) {
+ void* const dest, const ptrdiff_t dest_stride) {
auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
auto* dst = static_cast<uint8_t*>(dest);
int y = height;
@@ -100,7 +100,7 @@ void SuperRes_NEON(const void* const coefficients, void* const source,
int x = RightShiftWithCeiling(upscaled_width, 4);
// The below code calculates up to 15 extra upscaled
// pixels which will over-read up to 15 downscaled pixels in the end of each
- // row. kSuperResHorizontalBorder accounts for this.
+ // row. kSuperResHorizontalPadding accounts for this.
do {
for (int i = 0; i < 8; ++i, subpixel_x += step) {
sr[i] = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
@@ -135,8 +135,8 @@ void SuperRes_NEON(const void* const coefficients, void* const source,
vst1q_u8(dst_ptr, vcombine_u8(d0, d1));
dst_ptr += 16;
} while (--x != 0);
- src += stride;
- dst += stride;
+ src += source_stride;
+ dst += dest_stride;
} while (--y != 0);
}
@@ -149,12 +149,147 @@ void Init8bpp() {
} // namespace
} // namespace low_bitdepth
-void SuperResInit_NEON() { low_bitdepth::Init8bpp(); }
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void SuperResCoefficients_NEON(const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const coefficients) {
+ auto* dst = static_cast<uint16_t*>(coefficients);
+ int subpixel_x = initial_subpixel_x;
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ do {
+ uint16x8_t filter[8];
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ const uint8x8_t filter_8 =
+ vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
+ kSuperResExtraBits]);
+ // uint8_t -> uint16_t
+ filter[i] = vmovl_u8(filter_8);
+ }
+
+ Transpose8x8(filter);
+
+ vst1q_u16(dst, filter[0]);
+ dst += 8;
+ vst1q_u16(dst, filter[1]);
+ dst += 8;
+ vst1q_u16(dst, filter[2]);
+ dst += 8;
+ vst1q_u16(dst, filter[3]);
+ dst += 8;
+ vst1q_u16(dst, filter[4]);
+ dst += 8;
+ vst1q_u16(dst, filter[5]);
+ dst += 8;
+ vst1q_u16(dst, filter[6]);
+ dst += 8;
+ vst1q_u16(dst, filter[7]);
+ dst += 8;
+ } while (--x != 0);
+}
+
+// The sum is clipped to [0, ((1 << bitdepth) -1)]. Adding all positive and then
+// subtracting all negative with saturation will clip to zero.
+// 0 1 2 3 4 5 6 7
+// tap sign: - + - + + - + -
+inline uint16x8_t SuperRes(const uint16x8_t src[kSuperResFilterTaps],
+ const uint16_t** coefficients, int bitdepth) {
+ uint16x8_t f[kSuperResFilterTaps];
+ for (int i = 0; i < kSuperResFilterTaps; ++i, *coefficients += 8) {
+ f[i] = vld1q_u16(*coefficients);
+ }
+
+ uint32x4_t res_lo = vmull_u16(vget_low_u16(src[1]), vget_low_u16(f[1]));
+ res_lo = vmlal_u16(res_lo, vget_low_u16(src[3]), vget_low_u16(f[3]));
+ res_lo = vmlal_u16(res_lo, vget_low_u16(src[4]), vget_low_u16(f[4]));
+ res_lo = vmlal_u16(res_lo, vget_low_u16(src[6]), vget_low_u16(f[6]));
+
+ uint32x4_t temp_lo = vmull_u16(vget_low_u16(src[0]), vget_low_u16(f[0]));
+ temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[2]), vget_low_u16(f[2]));
+ temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[5]), vget_low_u16(f[5]));
+ temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[7]), vget_low_u16(f[7]));
+
+ res_lo = vqsubq_u32(res_lo, temp_lo);
+
+ uint32x4_t res_hi = vmull_u16(vget_high_u16(src[1]), vget_high_u16(f[1]));
+ res_hi = vmlal_u16(res_hi, vget_high_u16(src[3]), vget_high_u16(f[3]));
+ res_hi = vmlal_u16(res_hi, vget_high_u16(src[4]), vget_high_u16(f[4]));
+ res_hi = vmlal_u16(res_hi, vget_high_u16(src[6]), vget_high_u16(f[6]));
+ uint32x4_t temp_hi = vmull_u16(vget_high_u16(src[0]), vget_high_u16(f[0]));
+ temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[2]), vget_high_u16(f[2]));
+ temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[5]), vget_high_u16(f[5]));
+ temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[7]), vget_high_u16(f[7]));
+
+ res_hi = vqsubq_u32(res_hi, temp_hi);
+
+ const uint16x8_t res = vcombine_u16(vqrshrn_n_u32(res_lo, kFilterBits),
+ vqrshrn_n_u32(res_hi, kFilterBits));
+
+ // Clip the result at (1 << bd) - 1.
+ return vminq_u16(res, vdupq_n_u16((1 << bitdepth) - 1));
+}
+
+template <int bitdepth>
+void SuperRes_NEON(const void* const coefficients, void* const source,
+ const ptrdiff_t source_stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const dest, const ptrdiff_t dest_stride) {
+ auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<uint16_t*>(dest);
+ int y = height;
+ do {
+ const auto* filter = static_cast<const uint16_t*>(coefficients);
+ uint16_t* dst_ptr = dst;
+ ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+ int subpixel_x = initial_subpixel_x;
+ uint16x8_t sr[8];
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ // The below code calculates up to 7 extra upscaled
+ // pixels which will over-read up to 7 downscaled pixels in the end of each
+ // row. kSuperResHorizontalBorder accounts for this.
+ do {
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ sr[i] = vld1q_u16(&src[subpixel_x >> kSuperResScaleBits]);
+ }
+
+ Transpose8x8(sr);
+
+ const uint16x8_t d0 = SuperRes(sr, &filter, bitdepth);
+ vst1q_u16(dst_ptr, d0);
+ dst_ptr += 8;
+ } while (--x != 0);
+ src += source_stride;
+ dst += dest_stride;
+ } while (--y != 0);
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->super_res_coefficients = SuperResCoefficients_NEON;
+ dsp->super_res = SuperRes_NEON<10>;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void SuperResInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
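The tap-sign comment in the new 10-bit SuperRes() above explains why the accumulation can stay unsigned: the positive-tap and negative-tap products are summed separately and the negative group is removed with a saturating subtract. A minimal scalar sketch of that trick, not part of the patch, assuming kFilterBits == 7 (the AV1 upscale filter precision) and using the illustrative name SuperResScalar:

#include <algorithm>
#include <cstdint>

// Taps 1, 3, 4, 6 are positive and taps 0, 2, 5, 7 are negative, so both
// groups are summed as unsigned magnitudes; the saturating subtract clips
// the result at zero before the rounding shift and the (1 << bitdepth) - 1
// clamp.
uint16_t SuperResScalar(const uint16_t src[8], const uint16_t f[8],
                        int bitdepth) {
  constexpr int kFilterBits = 7;
  const uint32_t pos =
      src[1] * f[1] + src[3] * f[3] + src[4] * f[4] + src[6] * f[6];
  const uint32_t neg =
      src[0] * f[0] + src[2] * f[2] + src[5] * f[5] + src[7] * f[7];
  const uint32_t diff = (pos > neg) ? pos - neg : 0;            // vqsubq_u32
  const uint32_t out =
      (diff + (1 << (kFilterBits - 1))) >> kFilterBits;         // vqrshrn_n_u32
  const uint32_t max_pixel = (1u << bitdepth) - 1;
  return static_cast<uint16_t>(std::min(out, max_pixel));       // vminq_u16
}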
diff --git a/src/dsp/arm/super_res_neon.h b/src/dsp/arm/super_res_neon.h
index f51785d..65e48c5 100644
--- a/src/dsp/arm/super_res_neon.h
+++ b/src/dsp/arm/super_res_neon.h
@@ -31,7 +31,10 @@ void SuperResInit_NEON();
#if LIBGAV1_ENABLE_NEON
#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_SuperResClip LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_NEON
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
diff --git a/src/dsp/arm/warp_neon.cc b/src/dsp/arm/warp_neon.cc
index 7a41998..c7fb739 100644
--- a/src/dsp/arm/warp_neon.cc
+++ b/src/dsp/arm/warp_neon.cc
@@ -289,7 +289,7 @@ void Warp_NEON(const void* const source, const ptrdiff_t source_stride,
const int16x8_t sum = vld1q_s16(tmp);
vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
}
-#else // !defined(__aarch64__)
+#else  // !defined(__aarch64__)
int16x8_t filter[8];
for (int x = 0; x < 8; ++x) {
const int offset =
@@ -442,7 +442,7 @@ void WarpInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/weight_mask_neon.cc b/src/dsp/arm/weight_mask_neon.cc
index 49d3be0..7e5bff0 100644
--- a/src/dsp/arm/weight_mask_neon.cc
+++ b/src/dsp/arm/weight_mask_neon.cc
@@ -451,7 +451,7 @@ void WeightMaskInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {