1 files changed, 477 insertions, 466 deletions
diff --git a/src/dsp/arm/convolve_neon.cc b/src/dsp/arm/convolve_neon.cc
index fd9b912..331bfe2 100644
--- a/src/dsp/arm/convolve_neon.cc
+++ b/src/dsp/arm/convolve_neon.cc
@@ -101,245 +101,278 @@ int16x8_t SumOnePassTaps(const uint8x8_t* const src,
   return vreinterpretq_s16_u16(sum);
 }
 
-template <int filter_index, bool negative_outside_taps>
-int16x8_t SumHorizontalTaps(const uint8_t* const src,
-                            const uint8x8_t* const v_tap) {
-  uint8x8_t v_src[8];
-  const uint8x16_t src_long = vld1q_u8(src);
-  int16x8_t sum;
-
-  if (filter_index < 2) {
-    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 1));
-    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 2));
-    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 3));
-    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 4));
-    v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 5));
-    v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 6));
-    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 1);
-  } else if (filter_index == 2) {
-    v_src[0] = vget_low_u8(src_long);
-    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
-    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
-    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
-    v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
-    v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
-    v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
-    v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
-    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap);
-  } else if (filter_index == 3) {
-    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 3));
-    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 4));
-    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 3);
-  } else if (filter_index > 3) {
-    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 2));
-    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 3));
-    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 4));
-    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 5));
-    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 2);
-  }
-  return sum;
-}
-
-template <int filter_index, bool negative_outside_taps>
-uint8x8_t SimpleHorizontalTaps(const uint8_t* const src,
-                               const uint8x8_t* const v_tap) {
-  int16x8_t sum =
-      SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
-
-  // Normally the Horizontal pass does the downshift in two passes:
-  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
-  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
-  // requires adding the rounding offset from the skipped shift.
-  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
-
-  sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
-  return vqrshrun_n_s16(sum, kFilterBits - 1);
-}
-
-template <int filter_index, bool negative_outside_taps>
-uint16x8_t HorizontalTaps8To16(const uint8_t* const src,
-                               const uint8x8_t* const v_tap) {
-  const int16x8_t sum =
-      SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
-
-  return vreinterpretq_u16_s16(
-      vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
-}
-
-template <int filter_index>
-int16x8_t SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
-                               const uint8x8_t* const v_tap) {
-  uint16x8_t sum;
-  const uint8x8_t input0 = vld1_u8(src);
-  src += src_stride;
-  const uint8x8_t input1 = vld1_u8(src);
-  uint8x8x2_t input = vzip_u8(input0, input1);
-
-  if (filter_index == 3) {
-    // tap signs : + +
-    sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
-    sum = vmlal_u8(sum, input.val[1], v_tap[4]);
-  } else if (filter_index == 4) {
-    // tap signs : - + + -
-    sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
-    sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
-    sum = vmlal_u8(sum, input.val[1], v_tap[4]);
-    sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
-  } else {
-    // tap signs : + + + +
-    sum = vmull_u8(RightShift<4 * 8>(input.val[0]), v_tap[2]);
-    sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
-    sum = vmlal_u8(sum, input.val[1], v_tap[4]);
-    sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
-  }
-
-  return vreinterpretq_s16_u16(sum);
-}
-
-template <int filter_index>
-uint8x8_t SimpleHorizontalTaps2x2(const uint8_t* src,
-                                  const ptrdiff_t src_stride,
-                                  const uint8x8_t* const v_tap) {
-  int16x8_t sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
-  // Normally the Horizontal pass does the downshift in two passes:
-  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
-  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
-  // requires adding the rounding offset from the skipped shift.
-  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
-
-  sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
-  return vqrshrun_n_s16(sum, kFilterBits - 1);
-}
-
-template <int filter_index>
-uint16x8_t HorizontalTaps8To16_2x2(const uint8_t* src,
-                                   const ptrdiff_t src_stride,
-                                   const uint8x8_t* const v_tap) {
-  const int16x8_t sum =
-      SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
-  return vreinterpretq_u16_s16(
-      vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
-}
-
-template <int num_taps, int step, int filter_index,
-          bool negative_outside_taps = true, bool is_2d = false,
-          bool is_compound = false>
-void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
-                      void* const dest, const ptrdiff_t pred_stride,
-                      const int width, const int height,
-                      const uint8x8_t* const v_tap) {
+template <int filter_index, bool negative_outside_taps, bool is_2d,
+          bool is_compound>
+void FilterHorizontalWidth8AndUp(const uint8_t* src, const ptrdiff_t src_stride,
+                                 void* const dest, const ptrdiff_t pred_stride,
+                                 const int width, const int height,
+                                 const uint8x8_t* const v_tap) {
   auto* dest8 = static_cast<uint8_t*>(dest);
   auto* dest16 = static_cast<uint16_t*>(dest);
-
-  // 4 tap filters are never used when width > 4.
-  if (num_taps != 4 && width > 4) {
-    int y = 0;
+  if (!is_2d) {
+    int y = height;
     do {
       int x = 0;
-      do {
-        if (is_2d || is_compound) {
-          const uint16x8_t v_sum =
-              HorizontalTaps8To16<filter_index, negative_outside_taps>(&src[x],
-                                                                       v_tap);
+      do {  // Increasing loop counter x is better.
+        const uint8x16_t src_long = vld1q_u8(src + x);
+        uint8x8_t v_src[8];
+        int16x8_t sum;
+        if (filter_index < 2) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+          v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+          sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+                                                                    v_tap + 1);
+        } else if (filter_index == 2) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+          v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+          v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+          v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
+        } else if (filter_index == 3) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+        } else if (filter_index > 3) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+        }
+        if (is_compound) {
+          const uint16x8_t v_sum = vreinterpretq_u16_s16(
+              vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
           vst1q_u16(&dest16[x], v_sum);
         } else {
-          const uint8x8_t result =
-              SimpleHorizontalTaps<filter_index, negative_outside_taps>(&src[x],
-                                                                        v_tap);
+          // Normally the Horizontal pass does the downshift in two passes:
+          // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+          // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+          // Combining them requires adding the rounding offset from the skipped
+          // shift.
+          constexpr int first_shift_rounding_bit =
+              1 << (kInterRoundBitsHorizontal - 2);
+          sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+          const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
           vst1_u8(&dest8[x], result);
         }
-        x += step;
+        x += 8;
       } while (x < width);
       src += src_stride;
       dest8 += pred_stride;
       dest16 += pred_stride;
-    } while (++y < height);
+    } while (--y != 0);
+  } else {
+    int x = 0;
+    do {
+      const uint8_t* s = src + x;
+      int y = height;
+      do {  // Increasing loop counter x is better.
+        const uint8x16_t src_long = vld1q_u8(s);
+        uint8x8_t v_src[8];
+        int16x8_t sum;
+        if (filter_index < 2) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+          v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+          sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+                                                                    v_tap + 1);
+        } else if (filter_index == 2) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+          v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+          v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+          v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
+        } else if (filter_index == 3) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+        } else if (filter_index > 3) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+        }
+        const uint16x8_t v_sum = vreinterpretq_u16_s16(
+            vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+        vst1q_u16(dest16, v_sum);
+        s += src_stride;
+        dest16 += 8;
+      } while (--y != 0);
+      x += 8;
+    } while (x < width);
+  }
+}
+
+template <int filter_index, bool is_2d, bool is_compound>
+void FilterHorizontalWidth4(const uint8_t* src, const ptrdiff_t src_stride,
+                            void* const dest, const ptrdiff_t pred_stride,
+                            const int height, const uint8x8_t* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+  int y = height;
+  do {
+    uint8x8_t v_src[4];
+    int16x8_t sum;
+    v_src[0] = vld1_u8(src);
+    if (filter_index == 3) {
+      v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
+      sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+    } else {
+      v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
+      v_src[2] = RightShiftVector<2 * 8>(v_src[0]);
+      v_src[3] = RightShiftVector<3 * 8>(v_src[0]);
+      sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+    }
+    if (is_2d || is_compound) {
+      const uint16x4_t v_sum = vreinterpret_u16_s16(
+          vrshr_n_s16(vget_low_s16(sum), kInterRoundBitsHorizontal - 1));
+      vst1_u16(dest16, v_sum);
+    } else {
+      constexpr int first_shift_rounding_bit =
+          1 << (kInterRoundBitsHorizontal - 2);
+      sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+      const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
+      StoreLo4(&dest8[0], result);
+    }
+    src += src_stride;
+    dest8 += pred_stride;
+    dest16 += pred_stride;
+  } while (--y != 0);
+}
+
+template <int filter_index, bool is_2d>
+void FilterHorizontalWidth2(const uint8_t* src, const ptrdiff_t src_stride,
+                            void* const dest, const ptrdiff_t pred_stride,
+                            const int height, const uint8x8_t* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+  int y = height >> 1;
+  do {
+    const uint8x8_t input0 = vld1_u8(src);
+    const uint8x8_t input1 = vld1_u8(src + src_stride);
+    const uint8x8x2_t input = vzip_u8(input0, input1);
+    uint16x8_t sum;
+    if (filter_index == 3) {
+      // tap signs : + +
+      sum = vmull_u8(input.val[0], v_tap[3]);
+      sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 2), v_tap[4]);
+    } else if (filter_index == 4) {
+      // tap signs : - + + -
+      sum = vmull_u8(RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
+      sum = vmlsl_u8(sum, input.val[0], v_tap[2]);
+      sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
+      sum = vmlsl_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
+    } else {
+      // tap signs : + + + +
+      sum = vmull_u8(input.val[0], v_tap[2]);
+      sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
+      sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
+      sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
+    }
+    int16x8_t s = vreinterpretq_s16_u16(sum);
+    if (is_2d) {
+      const uint16x8_t v_sum =
+          vreinterpretq_u16_s16(vrshrq_n_s16(s, kInterRoundBitsHorizontal - 1));
+      dest16[0] = vgetq_lane_u16(v_sum, 0);
+      dest16[1] = vgetq_lane_u16(v_sum, 2);
+      dest16 += pred_stride;
+      dest16[0] = vgetq_lane_u16(v_sum, 1);
+      dest16[1] = vgetq_lane_u16(v_sum, 3);
+      dest16 += pred_stride;
+    } else {
+      // Normally the Horizontal pass does the downshift in two passes:
+      // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+      // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+      // Combining them requires adding the rounding offset from the skipped
+      // shift.
+      constexpr int first_shift_rounding_bit =
+          1 << (kInterRoundBitsHorizontal - 2);
+      s = vaddq_s16(s, vdupq_n_s16(first_shift_rounding_bit));
+      const uint8x8_t result = vqrshrun_n_s16(s, kFilterBits - 1);
+      dest8[0] = vget_lane_u8(result, 0);
+      dest8[1] = vget_lane_u8(result, 2);
+      dest8 += pred_stride;
+      dest8[0] = vget_lane_u8(result, 1);
+      dest8[1] = vget_lane_u8(result, 3);
+      dest8 += pred_stride;
+    }
+    src += src_stride << 1;
+  } while (--y != 0);
+
+  // The 2d filters have an odd |height| because the horizontal pass
+  // generates context for the vertical pass.
+  if (is_2d) {
+    assert(height % 2 == 1);
+    const uint8x8_t input = vld1_u8(src);
+    uint16x8_t sum;
+    if (filter_index == 3) {
+      sum = vmull_u8(input, v_tap[3]);
+      sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[4]);
+    } else if (filter_index == 4) {
+      sum = vmull_u8(RightShiftVector<1 * 8>(input), v_tap[3]);
+      sum = vmlsl_u8(sum, input, v_tap[2]);
+      sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
+      sum = vmlsl_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
+    } else {
+      assert(filter_index == 5);
+      sum = vmull_u8(input, v_tap[2]);
+      sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[3]);
+      sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
+      sum = vmlal_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
+    }
+    // |sum| contains an int16_t value.
+    sum = vreinterpretq_u16_s16(vrshrq_n_s16(vreinterpretq_s16_u16(sum),
+                                             kInterRoundBitsHorizontal - 1));
+    Store2<0>(dest16, sum);
+  }
+}
+
+template <int filter_index, bool negative_outside_taps, bool is_2d,
+          bool is_compound>
+void FilterHorizontal(const uint8_t* const src, const ptrdiff_t src_stride,
+                      void* const dest, const ptrdiff_t pred_stride,
+                      const int width, const int height,
+                      const uint8x8_t* const v_tap) {
+  assert(width < 8 || filter_index <= 3);
+  // Don't simplify the redundant if conditions with the template parameters,
+  // which helps the compiler generate compact code.
+  if (width >= 8 && filter_index <= 3) {
+    FilterHorizontalWidth8AndUp<filter_index, negative_outside_taps, is_2d,
+                                is_compound>(src, src_stride, dest, pred_stride,
+                                             width, height, v_tap);
     return;
   }
 
-  // Horizontal passes only needs to account for |num_taps| 2 and 4 when
+  // Horizontal passes only needs to account for number of taps 2 and 4 when
   // |width| <= 4.
   assert(width <= 4);
-  assert(num_taps <= 4);
-  if (num_taps <= 4) {
+  assert(filter_index >= 3 && filter_index <= 5);
+  if (filter_index >= 3 && filter_index <= 5) {
     if (width == 4) {
-      int y = 0;
-      do {
-        if (is_2d || is_compound) {
-          const uint16x8_t v_sum =
-              HorizontalTaps8To16<filter_index, negative_outside_taps>(src,
-                                                                       v_tap);
-          vst1_u16(dest16, vget_low_u16(v_sum));
-        } else {
-          const uint8x8_t result =
-              SimpleHorizontalTaps<filter_index, negative_outside_taps>(src,
-                                                                        v_tap);
-          StoreLo4(&dest8[0], result);
-        }
-        src += src_stride;
-        dest8 += pred_stride;
-        dest16 += pred_stride;
-      } while (++y < height);
+      FilterHorizontalWidth4<filter_index, is_2d, is_compound>(
+          src, src_stride, dest, pred_stride, height, v_tap);
       return;
     }
-
+    assert(width == 2);
     if (!is_compound) {
-      int y = 0;
-      do {
-        if (is_2d) {
-          const uint16x8_t sum =
-              HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
-          dest16[0] = vgetq_lane_u16(sum, 0);
-          dest16[1] = vgetq_lane_u16(sum, 2);
-          dest16 += pred_stride;
-          dest16[0] = vgetq_lane_u16(sum, 1);
-          dest16[1] = vgetq_lane_u16(sum, 3);
-          dest16 += pred_stride;
-        } else {
-          const uint8x8_t sum =
-              SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
-          dest8[0] = vget_lane_u8(sum, 0);
-          dest8[1] = vget_lane_u8(sum, 2);
-          dest8 += pred_stride;
-
-          dest8[0] = vget_lane_u8(sum, 1);
-          dest8[1] = vget_lane_u8(sum, 3);
-          dest8 += pred_stride;
-        }
-
-        src += src_stride << 1;
-        y += 2;
-      } while (y < height - 1);
-
-      // The 2d filters have an odd |height| because the horizontal pass
-      // generates context for the vertical pass.
-      if (is_2d) {
-        assert(height % 2 == 1);
-        uint16x8_t sum;
-        const uint8x8_t input = vld1_u8(src);
-        if (filter_index == 3) {  // |num_taps| == 2
-          sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
-          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
-        } else if (filter_index == 4) {
-          sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
-          sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
-          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
-          sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
-        } else {
-          assert(filter_index == 5);
-          sum = vmull_u8(RightShift<2 * 8>(input), v_tap[2]);
-          sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
-          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
-          sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
-        }
-        // |sum| contains an int16_t value.
-        sum = vreinterpretq_u16_s16(vrshrq_n_s16(
-            vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1));
-        Store2<0>(dest16, sum);
-      }
+      FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest,
+                                                  pred_stride, height, v_tap);
     }
   }
 }
@@ -451,78 +484,85 @@ int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
 }
 
 template <int num_taps, bool is_compound = false>
-void Filter2DVertical(const uint16_t* src, void* const dst,
-                      const ptrdiff_t dst_stride, const int width,
-                      const int height, const int16x8_t taps) {
+void Filter2DVerticalWidth8AndUp(const uint16_t* src, void* const dst,
+                                 const ptrdiff_t dst_stride, const int width,
+                                 const int height, const int16x8_t taps) {
   assert(width >= 8);
   constexpr int next_row = num_taps - 1;
-  // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
-  const ptrdiff_t src_stride = width;
-
-  auto* dst8 = static_cast<uint8_t*>(dst);
-  auto* dst16 = static_cast<uint16_t*>(dst);
+  auto* const dst8 = static_cast<uint8_t*>(dst);
+  auto* const dst16 = static_cast<uint16_t*>(dst);
 
   int x = 0;
   do {
-    int16x8_t srcs[8];
-    const uint16_t* src_x = src + x;
-    srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src_x));
-    src_x += src_stride;
+    int16x8_t srcs[9];
+    srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
     if (num_taps >= 4) {
-      srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src_x));
-      src_x += src_stride;
-      srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src_x));
-      src_x += src_stride;
+      srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
+      srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
       if (num_taps >= 6) {
-        srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src_x));
-        src_x += src_stride;
-        srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src_x));
-        src_x += src_stride;
+        srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src));
+        src += 8;
+        srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+        src += 8;
         if (num_taps == 8) {
-          srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src_x));
-          src_x += src_stride;
-          srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src_x));
-          src_x += src_stride;
+          srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src));
+          src += 8;
+          srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
+          src += 8;
         }
       }
     }
 
-    int y = 0;
+    uint8_t* d8 = dst8 + x;
+    uint16_t* d16 = dst16 + x;
+    int y = height;
     do {
-      srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src_x));
-      src_x += src_stride;
-
-      const int16x8_t sum =
-          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+      srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
+      srcs[next_row + 1] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
+      const int16x8_t sum0 =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 0, taps);
+      const int16x8_t sum1 =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 1, taps);
       if (is_compound) {
-        vst1q_u16(dst16 + x + y * dst_stride, vreinterpretq_u16_s16(sum));
+        vst1q_u16(d16, vreinterpretq_u16_s16(sum0));
+        d16 += dst_stride;
+        vst1q_u16(d16, vreinterpretq_u16_s16(sum1));
+        d16 += dst_stride;
       } else {
-        vst1_u8(dst8 + x + y * dst_stride, vqmovun_s16(sum));
+        vst1_u8(d8, vqmovun_s16(sum0));
+        d8 += dst_stride;
+        vst1_u8(d8, vqmovun_s16(sum1));
+        d8 += dst_stride;
       }
-
-      srcs[0] = srcs[1];
+      srcs[0] = srcs[2];
       if (num_taps >= 4) {
-        srcs[1] = srcs[2];
-        srcs[2] = srcs[3];
+        srcs[1] = srcs[3];
+        srcs[2] = srcs[4];
         if (num_taps >= 6) {
-          srcs[3] = srcs[4];
-          srcs[4] = srcs[5];
+          srcs[3] = srcs[5];
+          srcs[4] = srcs[6];
           if (num_taps == 8) {
-            srcs[5] = srcs[6];
-            srcs[6] = srcs[7];
+            srcs[5] = srcs[7];
+            srcs[6] = srcs[8];
           }
         }
       }
-    } while (++y < height);
+      y -= 2;
+    } while (y != 0);
     x += 8;
   } while (x < width);
 }
 
 // Take advantage of |src_stride| == |width| to process two rows at a time.
 template <int num_taps, bool is_compound = false>
-void Filter2DVertical4xH(const uint16_t* src, void* const dst,
-                         const ptrdiff_t dst_stride, const int height,
-                         const int16x8_t taps) {
+void Filter2DVerticalWidth4(const uint16_t* src, void* const dst,
+                            const ptrdiff_t dst_stride, const int height,
+                            const int16x8_t taps) {
   auto* dst8 = static_cast<uint8_t*>(dst);
   auto* dst16 = static_cast<uint16_t*>(dst);
 
@@ -545,7 +585,7 @@ void Filter2DVertical4xH(const uint16_t* src, void* const dst,
     }
   }
 
-  int y = 0;
+  int y = height;
   do {
     srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
     src += 8;
@@ -580,15 +620,15 @@ void Filter2DVertical4xH(const uint16_t* src, void* const dst,
         }
       }
     }
-    y += 2;
-  } while (y < height);
+    y -= 2;
+  } while (y != 0);
 }
 
 // Take advantage of |src_stride| == |width| to process four rows at a time.
 template <int num_taps>
-void Filter2DVertical2xH(const uint16_t* src, void* const dst,
-                         const ptrdiff_t dst_stride, const int height,
-                         const int16x8_t taps) {
+void Filter2DVerticalWidth2(const uint16_t* src, void* const dst,
+                            const ptrdiff_t dst_stride, const int height,
+                            const int16x8_t taps) {
   constexpr int next_row = (num_taps < 6) ? 4 : 8;
 
   auto* dst8 = static_cast<uint8_t*>(dst);
@@ -672,29 +712,47 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
   }
 
   if (filter_index == 2) {  // 8 tap.
-    FilterHorizontal<8, 8, 2, true, is_2d, is_compound>(
+    FilterHorizontal<2, true, is_2d, is_compound>(
         src, src_stride, dst, dst_stride, width, height, v_tap);
   } else if (filter_index == 1) {  // 6 tap.
     // Check if outside taps are positive.
     if ((filter_id == 1) | (filter_id == 15)) {
-      FilterHorizontal<6, 8, 1, false, is_2d, is_compound>(
-          src, src_stride, dst, dst_stride, width, height, v_tap);
+      FilterHorizontal<1, false, is_2d, is_compound>(
+          src + 1, src_stride, dst, dst_stride, width, height, v_tap);
     } else {
-      FilterHorizontal<6, 8, 1, true, is_2d, is_compound>(
-          src, src_stride, dst, dst_stride, width, height, v_tap);
+      FilterHorizontal<1, true, is_2d, is_compound>(
+          src + 1, src_stride, dst, dst_stride, width, height, v_tap);
     }
   } else if (filter_index == 0) {  // 6 tap.
-    FilterHorizontal<6, 8, 0, true, is_2d, is_compound>(
-        src, src_stride, dst, dst_stride, width, height, v_tap);
+    FilterHorizontal<0, true, is_2d, is_compound>(
+        src + 1, src_stride, dst, dst_stride, width, height, v_tap);
   } else if (filter_index == 4) {  // 4 tap.
-    FilterHorizontal<4, 8, 4, true, is_2d, is_compound>(
-        src, src_stride, dst, dst_stride, width, height, v_tap);
+    FilterHorizontal<4, true, is_2d, is_compound>(
+        src + 2, src_stride, dst, dst_stride, width, height, v_tap);
   } else if (filter_index == 5) {  // 4 tap.
-    FilterHorizontal<4, 8, 5, true, is_2d, is_compound>(
-        src, src_stride, dst, dst_stride, width, height, v_tap);
+    FilterHorizontal<5, true, is_2d, is_compound>(
+        src + 2, src_stride, dst, dst_stride, width, height, v_tap);
   } else {  // 2 tap.
-    FilterHorizontal<2, 8, 3, true, is_2d, is_compound>(
-        src, src_stride, dst, dst_stride, width, height, v_tap);
+    FilterHorizontal<3, true, is_2d, is_compound>(
+        src + 3, src_stride, dst, dst_stride, width, height, v_tap);
+  }
+}
+
+template <int vertical_taps>
+void Filter2DVertical(const uint16_t* const intermediate_result,
+                      const int width, const int height, const int16x8_t taps,
+                      void* const prediction, const ptrdiff_t pred_stride) {
+  auto* const dest = static_cast<uint8_t*>(prediction);
+  if (width >= 8) {
+    Filter2DVerticalWidth8AndUp<vertical_taps>(
+        intermediate_result, dest, pred_stride, width, height, taps);
+  } else if (width == 4) {
+    Filter2DVerticalWidth4<vertical_taps>(intermediate_result, dest,
+                                          pred_stride, height, taps);
+  } else {
+    assert(width == 2);
+    Filter2DVerticalWidth2<vertical_taps>(intermediate_result, dest,
+                                          pred_stride, height, taps);
   }
 }
 
@@ -704,7 +762,7 @@ void Convolve2D_NEON(const void* const reference,
                      const int vertical_filter_index,
                      const int horizontal_filter_id,
                      const int vertical_filter_id, const int width,
-                     const int height, void* prediction,
+                     const int height, void* const prediction,
                      const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
@@ -715,67 +773,31 @@ void Convolve2D_NEON(const void* const reference,
       intermediate_result[kMaxSuperBlockSizeInPixels *
                           (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
   const int intermediate_height = height + vertical_taps - 1;
-
   const ptrdiff_t src_stride = reference_stride;
-  const auto* src = static_cast<const uint8_t*>(reference) -
-                    (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+  const auto* const src = static_cast<const uint8_t*>(reference) -
+                          (vertical_taps / 2 - 1) * src_stride -
+                          kHorizontalOffset;
 
   DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
                                    width, intermediate_height,
                                    horizontal_filter_id, horiz_filter_index);
 
   // Vertical filter.
-  auto* dest = static_cast<uint8_t*>(prediction);
-  const ptrdiff_t dest_stride = pred_stride;
   assert(vertical_filter_id != 0);
-
   const int16x8_t taps = vmovl_s8(
       vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
-
   if (vertical_taps == 8) {
-    if (width == 2) {
-      Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
-                             taps);
-    } else if (width == 4) {
-      Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
-                             taps);
-    } else {
-      Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
-                          taps);
-    }
+    Filter2DVertical<8>(intermediate_result, width, height, taps, prediction,
+                        pred_stride);
   } else if (vertical_taps == 6) {
-    if (width == 2) {
-      Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
-                             taps);
-    } else if (width == 4) {
-      Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
-                             taps);
-    } else {
-      Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
-                          taps);
-    }
+    Filter2DVertical<6>(intermediate_result, width, height, taps, prediction,
+                        pred_stride);
   } else if (vertical_taps == 4) {
-    if (width == 2) {
-      Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
-                             taps);
-    } else if (width == 4) {
-      Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
-                             taps);
-    } else {
-      Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
-                          taps);
-    }
+    Filter2DVertical<4>(intermediate_result, width, height, taps, prediction,
+                        pred_stride);
   } else {  // |vertical_taps| == 2
-    if (width == 2) {
-      Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
-                             taps);
-    } else if (width == 4) {
-      Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
-                             taps);
-    } else {
-      Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
-                          taps);
-    }
+    Filter2DVertical<2>(intermediate_result, width, height, taps, prediction,
+                        pred_stride);
   }
 }
 
@@ -788,7 +810,7 @@ void Convolve2D_NEON(const void* const reference,
 // increments. The first load covers the initial elements of src_x, while the
 // final load covers the taps.
 template <int grade_x>
-inline uint8x8x3_t LoadSrcVals(const uint8_t* src_x) {
+inline uint8x8x3_t LoadSrcVals(const uint8_t* const src_x) {
   uint8x8x3_t ret;
   const uint8x16_t src_val = vld1q_u8(src_x);
   ret.val[0] = vget_low_u8(src_val);
@@ -811,7 +833,7 @@ inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
 }
 
 template <int grade_x>
-inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
+inline void ConvolveKernelHorizontal2Tap(const uint8_t* const src,
                                          const ptrdiff_t src_stride,
                                          const int width, const int subpixel_x,
                                          const int step_x,
@@ -843,7 +865,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
     // on x.
     const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
                                VQTbl1U8(filter_taps1, filter_indices)};
-    int y = 0;
+    int y = intermediate_height;
     do {
       // Load a pool of samples to select from using stepped indices.
       const uint8x16_t src_vals = vld1q_u8(src_x);
@@ -860,7 +882,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
                              kInterRoundBitsHorizontal - 1));
       src_x += src_stride;
       intermediate += kIntermediateStride;
-    } while (++y < intermediate_height);
+    } while (--y != 0);
     return;
   }
 
@@ -883,7 +905,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
     // on x.
     const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
                                VQTbl1U8(filter_taps1, filter_indices)};
-    int y = 0;
+    int y = intermediate_height;
     do {
       // Load a pool of samples to select from using stepped indices.
       const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -900,7 +922,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
                              kInterRoundBitsHorizontal - 1));
       src_x += src_stride;
       intermediate_x += kIntermediateStride;
-    } while (++y < intermediate_height);
+    } while (--y != 0);
     x += 8;
     p += step_x8;
   } while (x < width);
@@ -921,7 +943,7 @@ inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
 
 // This filter is only possible when width <= 4.
 void ConvolveKernelHorizontalPositive4Tap(
-    const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
+    const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x,
     const int step_x, const int intermediate_height, int16_t* intermediate) {
   const int kernel_offset = 2;
   const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -950,7 +972,7 @@ void ConvolveKernelHorizontalPositive4Tap(
 
   const uint8x8_t src_indices =
       vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
-  int y = 0;
+  int y = intermediate_height;
   do {
     // Load a pool of samples to select from using stepped index vectors.
     const uint8x16_t src_vals = vld1q_u8(src_x);
@@ -970,7 +992,7 @@ void ConvolveKernelHorizontalPositive4Tap(
 
     src_x += src_stride;
     intermediate += kIntermediateStride;
-  } while (++y < intermediate_height);
+  } while (--y != 0);
 }
 
 // Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
@@ -988,7 +1010,7 @@ inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
 
 // This filter is only possible when width <= 4.
 inline void ConvolveKernelHorizontalSigned4Tap(
-    const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
+    const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x,
     const int step_x, const int intermediate_height, int16_t* intermediate) {
   const int kernel_offset = 2;
   const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -1025,7 +1047,7 @@ inline void ConvolveKernelHorizontalSigned4Tap(
                                     vadd_u8(src_indices_base, vdup_n_u8(2)),
                                     vadd_u8(src_indices_base, vdup_n_u8(3))};
 
-  int y = 0;
+  int y = intermediate_height;
   do {
     // Load a pool of samples to select from using stepped indices.
     const uint8x16_t src_vals = vld1q_u8(src_x);
@@ -1042,7 +1064,7 @@ inline void ConvolveKernelHorizontalSigned4Tap(
                            kInterRoundBitsHorizontal - 1));
     src_x += src_stride;
     intermediate += kIntermediateStride;
-  } while (++y < intermediate_height);
+  } while (--y != 0);
 }
 
 // Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
@@ -1063,9 +1085,9 @@ inline uint8x16_t GetSigned6TapFilter(const int tap_index) {
 // This filter is only possible when width >= 8.
 template <int grade_x>
 inline void ConvolveKernelHorizontalSigned6Tap(
-    const uint8_t* src, const ptrdiff_t src_stride, const int width,
+    const uint8_t* const src, const ptrdiff_t src_stride, const int width,
     const int subpixel_x, const int step_x, const int intermediate_height,
-    int16_t* intermediate) {
+    int16_t* const intermediate) {
   const int kernel_offset = 1;
   const uint8x8_t one = vdup_n_u8(1);
   const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
@@ -1107,7 +1129,7 @@ inline void ConvolveKernelHorizontalSigned6Tap(
     for (int i = 0; i < 6; ++i) {
       taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
     }
-    int y = 0;
+    int y = intermediate_height;
     do {
       // Load a pool of samples to select from using stepped indices.
       const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -1122,7 +1144,7 @@ inline void ConvolveKernelHorizontalSigned6Tap(
                              kInterRoundBitsHorizontal - 1));
       src_x += src_stride;
       intermediate_x += kIntermediateStride;
-    } while (++y < intermediate_height);
+    } while (--y != 0);
     x += 8;
     p += step_x8;
   } while (x < width);
@@ -1156,9 +1178,9 @@ inline int8x16_t GetMixed6TapFilter(const int tap_index) {
 // This filter is only possible when width >= 8.
 template <int grade_x>
 inline void ConvolveKernelHorizontalMixed6Tap(
-    const uint8_t* src, const ptrdiff_t src_stride, const int width,
+    const uint8_t* const src, const ptrdiff_t src_stride, const int width,
     const int subpixel_x, const int step_x, const int intermediate_height,
-    int16_t* intermediate) {
+    int16_t* const intermediate) {
   const int kernel_offset = 1;
   const uint8x8_t one = vdup_n_u8(1);
   const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
@@ -1205,7 +1227,7 @@ inline void ConvolveKernelHorizontalMixed6Tap(
     mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
     mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));
 
-    int y = 0;
+    int y = intermediate_height;
     do {
       // Load a pool of samples to select from using stepped indices.
       const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -1224,7 +1246,7 @@ inline void ConvolveKernelHorizontalMixed6Tap(
                                              kInterRoundBitsHorizontal - 1));
       src_x += src_stride;
       intermediate_x += kIntermediateStride;
-    } while (++y < intermediate_height);
+    } while (--y != 0);
     x += 8;
     p += step_x8;
   } while (x < width);
@@ -1250,9 +1272,9 @@ inline uint8x16_t GetSigned8TapFilter(const int tap_index) {
 // This filter is only possible when width >= 8.
 template <int grade_x>
 inline void ConvolveKernelHorizontalSigned8Tap(
-    const uint8_t* src, const ptrdiff_t src_stride, const int width,
+    const uint8_t* const src, const ptrdiff_t src_stride, const int width,
     const int subpixel_x, const int step_x, const int intermediate_height,
-    int16_t* intermediate) {
+    int16_t* const intermediate) {
   const uint8x8_t one = vdup_n_u8(1);
   const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
   const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -1290,7 +1312,7 @@ inline void ConvolveKernelHorizontalSigned8Tap(
       taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
     }
 
-    int y = 0;
+    int y = intermediate_height;
     do {
       // Load a pool of samples to select from using stepped indices.
       const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -1306,7 +1328,7 @@ inline void ConvolveKernelHorizontalSigned8Tap(
                              kInterRoundBitsHorizontal - 1));
       src_x += src_stride;
       intermediate_x += kIntermediateStride;
-    } while (++y < intermediate_height);
+    } while (--y != 0);
     x += 8;
     p += step_x8;
   } while (x < width);
@@ -1314,9 +1336,9 @@ inline void ConvolveKernelHorizontalSigned8Tap(
 
 // This function handles blocks of width 2 or 4.
 template <int num_taps, int grade_y, int width, bool is_compound>
-void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
+void ConvolveVerticalScale4xH(const int16_t* const src, const int subpixel_y,
                               const int filter_index, const int step_y,
-                              const int height, void* dest,
+                              const int height, void* const dest,
                               const ptrdiff_t dest_stride) {
   constexpr ptrdiff_t src_stride = kIntermediateStride;
   const int16_t* src_y = src;
@@ -1327,8 +1349,8 @@ void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
 
   int p = subpixel_y & 1023;
   int prev_p = p;
-  int y = 0;
-  do {  // y < height
+  int y = height;
+  do {
     for (int i = 0; i < num_taps; ++i) {
       s[i] = vld1_s16(src_y + i * src_stride);
     }
@@ -1381,16 +1403,16 @@ void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
     prev_p = p;
     dest16_y += dest_stride;
     dest_y += dest_stride;
-
-    y += 2;
-  } while (y < height);
+    y -= 2;
+  } while (y != 0);
 }
 
 template <int num_taps, int grade_y, bool is_compound>
-inline void ConvolveVerticalScale(const int16_t* src, const int width,
+inline void ConvolveVerticalScale(const int16_t* const src, const int width,
                                   const int subpixel_y, const int filter_index,
                                   const int step_y, const int height,
-                                  void* dest, const ptrdiff_t dest_stride) {
+                                  void* const dest,
+                                  const ptrdiff_t dest_stride) {
   constexpr ptrdiff_t src_stride = kIntermediateStride;
   // A possible improvement is to use arithmetic to decide how many times to
   // apply filters to same source before checking whether to load new srcs.
@@ -1401,15 +1423,15 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width,
   uint8_t* dest_y;
 
   int x = 0;
-  do {  // x < width
-    const int16_t* src_x = src + x;
+  do {
+    const int16_t* const src_x = src + x;
     const int16_t* src_y = src_x;
     dest16_y = static_cast<uint16_t*>(dest) + x;
     dest_y = static_cast<uint8_t*>(dest) + x;
     int p = subpixel_y & 1023;
     int prev_p = p;
-    int y = 0;
-    do {  // y < height
+    int y = height;
+    do {
       for (int i = 0; i < num_taps; ++i) {
         s[i] = vld1q_s16(src_y + i * src_stride);
       }
@@ -1448,9 +1470,8 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width,
       prev_p = p;
       dest16_y += dest_stride;
       dest_y += dest_stride;
-
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
     x += 8;
   } while (x < width);
 }
@@ -1462,7 +1483,7 @@ void ConvolveScale2D_NEON(const void* const reference,
                           const int vertical_filter_index, const int subpixel_x,
                           const int subpixel_y, const int step_x,
                           const int step_y, const int width, const int height,
-                          void* prediction, const ptrdiff_t pred_stride) {
+                          void* const prediction, const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
   assert(step_x <= 2048);
@@ -1699,12 +1720,13 @@ void ConvolveHorizontal_NEON(const void* const reference,
                              const int /*vertical_filter_index*/,
                              const int horizontal_filter_id,
                              const int /*vertical_filter_id*/, const int width,
-                             const int height, void* prediction,
+                             const int height, void* const prediction,
                              const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
   // Set |src| to the outermost tap.
-  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
-  auto* dest = static_cast<uint8_t*>(prediction);
+  const auto* const src =
+      static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* const dest = static_cast<uint8_t*>(prediction);
 
   DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
                    horizontal_filter_id, filter_index);
@@ -1719,14 +1741,14 @@ uint16x8_t Compound1DShift(const int16x8_t sum) {
 
 template <int filter_index, bool is_compound = false,
           bool negative_outside_taps = false>
-void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
+void FilterVertical(const uint8_t* const src, const ptrdiff_t src_stride,
                     void* const dst, const ptrdiff_t dst_stride,
                     const int width, const int height,
                     const uint8x8_t* const taps) {
   const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
-  auto* dst8 = static_cast<uint8_t*>(dst);
-  auto* dst16 = static_cast<uint16_t*>(dst);
+  auto* const dst8 = static_cast<uint8_t*>(dst);
+  auto* const dst16 = static_cast<uint16_t*>(dst);
   assert(width >= 8);
 
   int x = 0;
@@ -1754,6 +1776,9 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
       }
     }
 
+    // Decreasing the y loop counter produces worse code with clang.
+    // Don't unroll this loop since it generates too much code and the decoder
+    // is even slower.
     int y = 0;
     do {
       srcs[next_row] = vld1_u8(src_x);
@@ -1804,7 +1829,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
     srcs[0] = Load4(src);
     src += src_stride;
 
-    int y = 0;
+    int y = height;
     do {
       srcs[0] = Load4<1>(src, srcs[0]);
       src += src_stride;
@@ -1829,8 +1854,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
       }
 
       srcs[0] = srcs[2];
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
   } else if (num_taps == 4) {
     srcs[4] = vdup_n_u8(0);
 
@@ -1842,7 +1867,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
     src += src_stride;
     srcs[1] = vext_u8(srcs[0], srcs[2], 4);
 
-    int y = 0;
+    int y = height;
     do {
       srcs[2] = Load4<1>(src, srcs[2]);
       src += src_stride;
@@ -1869,8 +1894,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
       srcs[0] = srcs[2];
       srcs[1] = srcs[3];
       srcs[2] = srcs[4];
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
   } else if (num_taps == 6) {
     srcs[6] = vdup_n_u8(0);
 
@@ -1887,7 +1912,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
     src += src_stride;
     srcs[3] = vext_u8(srcs[2], srcs[4], 4);
 
-    int y = 0;
+    int y = height;
     do {
       srcs[4] = Load4<1>(src, srcs[4]);
       src += src_stride;
@@ -1916,8 +1941,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
       srcs[2] = srcs[4];
       srcs[3] = srcs[5];
       srcs[4] = srcs[6];
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
   } else if (num_taps == 8) {
     srcs[8] = vdup_n_u8(0);
 
@@ -1939,7 +1964,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
     src += src_stride;
     srcs[5] = vext_u8(srcs[4], srcs[6], 4);
 
-    int y = 0;
+    int y = height;
     do {
       srcs[6] = Load4<1>(src, srcs[6]);
       src += src_stride;
@@ -1970,8 +1995,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
       srcs[4] = srcs[6];
       srcs[5] = srcs[7];
       srcs[6] = srcs[8];
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
   }
 }
 
@@ -2186,14 +2211,14 @@ void ConvolveVertical_NEON(const void* const reference,
                            const int vertical_filter_index,
                            const int /*horizontal_filter_id*/,
                            const int vertical_filter_id, const int width,
-                           const int height, void* prediction,
+                           const int height, void* const prediction,
                            const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(filter_index);
   const ptrdiff_t src_stride = reference_stride;
   const auto* src = static_cast<const uint8_t*>(reference) -
                     (vertical_taps / 2 - 1) * src_stride;
-  auto* dest = static_cast<uint8_t*>(prediction);
+  auto* const dest = static_cast<uint8_t*>(prediction);
   const ptrdiff_t dest_stride = pred_stride;
   assert(vertical_filter_id != 0);
 
@@ -2303,7 +2328,7 @@ void ConvolveCompoundCopy_NEON(
     const void* const reference, const ptrdiff_t reference_stride,
     const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
     const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
-    const int width, const int height, void* prediction,
+    const int width, const int height, void* const prediction,
     const ptrdiff_t /*pred_stride*/) {
   const auto* src = static_cast<const uint8_t*>(reference);
   const ptrdiff_t src_stride = reference_stride;
@@ -2312,7 +2337,7 @@ void ConvolveCompoundCopy_NEON(
       kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
 
   if (width >= 16) {
-    int y = 0;
+    int y = height;
     do {
       int x = 0;
       do {
@@ -2328,20 +2353,20 @@ void ConvolveCompoundCopy_NEON(
       } while (x < width);
       src += src_stride;
       dest += width;
-    } while (++y < height);
+    } while (--y != 0);
   } else if (width == 8) {
-    int y = 0;
+    int y = height;
     do {
       const uint8x8_t v_src = vld1_u8(&src[0]);
       const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
       vst1q_u16(&dest[0], v_dest);
       src += src_stride;
       dest += width;
-    } while (++y < height);
-  } else { /* width == 4 */
+    } while (--y != 0);
+  } else {  // width == 4
     uint8x8_t v_src = vdup_n_u8(0);
 
-    int y = 0;
+    int y = height;
     do {
       v_src = Load4<0>(&src[0], v_src);
       src += src_stride;
@@ -2350,8 +2375,8 @@ void ConvolveCompoundCopy_NEON(
       const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
       vst1q_u16(&dest[0], v_dest);
       dest += 4 << 1;
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
   }
 }
 
@@ -2359,14 +2384,14 @@ void ConvolveCompoundVertical_NEON(
     const void* const reference, const ptrdiff_t reference_stride,
     const int /*horizontal_filter_index*/, const int vertical_filter_index,
     const int /*horizontal_filter_id*/, const int vertical_filter_id,
-    const int width, const int height, void* prediction,
+    const int width, const int height, void* const prediction,
     const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(filter_index);
   const ptrdiff_t src_stride = reference_stride;
   const auto* src = static_cast<const uint8_t*>(reference) -
                     (vertical_taps / 2 - 1) * src_stride;
-  auto* dest = static_cast<uint16_t*>(prediction);
+  auto* const dest = static_cast<uint16_t*>(prediction);
   assert(vertical_filter_id != 0);
 
   uint8x8_t taps[8];
@@ -2454,24 +2479,39 @@ void ConvolveCompoundHorizontal_NEON(
     const void* const reference, const ptrdiff_t reference_stride,
     const int horizontal_filter_index, const int /*vertical_filter_index*/,
     const int horizontal_filter_id, const int /*vertical_filter_id*/,
-    const int width, const int height, void* prediction,
+    const int width, const int height, void* const prediction,
     const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
-  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
-  auto* dest = static_cast<uint16_t*>(prediction);
+  const auto* const src =
+      static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* const dest = static_cast<uint16_t*>(prediction);
 
   DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
       src, reference_stride, dest, width, width, height, horizontal_filter_id,
       filter_index);
 }
 
+template <int vertical_taps>
+void Compound2DVertical(const uint16_t* const intermediate_result,
+                        const int width, const int height, const int16x8_t taps,
+                        void* const prediction) {
+  auto* const dest = static_cast<uint16_t*>(prediction);
+  if (width == 4) {
+    Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>(
+        intermediate_result, dest, width, height, taps);
+  } else {
+    Filter2DVerticalWidth8AndUp<vertical_taps, /*is_compound=*/true>(
+        intermediate_result, dest, width, width, height, taps);
+  }
+}
+
 void ConvolveCompound2D_NEON(const void* const reference,
                              const ptrdiff_t reference_stride,
                              const int horizontal_filter_index,
                              const int vertical_filter_index,
                              const int horizontal_filter_id,
                              const int vertical_filter_id, const int width,
-                             const int height, void* prediction,
+                             const int height, void* const prediction,
                              const ptrdiff_t /*pred_stride*/) {
   // The output of the horizontal filter, i.e. the intermediate_result, is
   // guaranteed to fit in int16_t.
@@ -2492,55 +2532,26 @@ void ConvolveCompound2D_NEON(const void* const reference,
   const auto* const src = static_cast<const uint8_t*>(reference) -
                           (vertical_taps / 2 - 1) * src_stride -
                           kHorizontalOffset;
-
   DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
       src, src_stride, intermediate_result, width, width, intermediate_height,
       horizontal_filter_id, horiz_filter_index);
 
   // Vertical filter.
-  auto* dest = static_cast<uint16_t*>(prediction);
   assert(vertical_filter_id != 0);
-
-  const ptrdiff_t dest_stride = width;
   const int16x8_t taps = vmovl_s8(
       vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
-
   if (vertical_taps == 8) {
-    if (width == 4) {
-      Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
-                                                   dest_stride, height, taps);
-    } else {
-      Filter2DVertical<8, /*is_compound=*/true>(
-          intermediate_result, dest, dest_stride, width, height, taps);
-    }
+    Compound2DVertical<8>(intermediate_result, width, height, taps, prediction);
   } else if (vertical_taps == 6) {
-    if (width == 4) {
-      Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
-                                                   dest_stride, height, taps);
-    } else {
-      Filter2DVertical<6, /*is_compound=*/true>(
-          intermediate_result, dest, dest_stride, width, height, taps);
-    }
+    Compound2DVertical<6>(intermediate_result, width, height, taps, prediction);
   } else if (vertical_taps == 4) {
-    if (width == 4) {
-      Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
-                                                   dest_stride, height, taps);
-    } else {
-      Filter2DVertical<4, /*is_compound=*/true>(
-          intermediate_result, dest, dest_stride, width, height, taps);
-    }
+    Compound2DVertical<4>(intermediate_result, width, height, taps, prediction);
   } else {  // |vertical_taps| == 2
-    if (width == 4) {
-      Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
-                                                   dest_stride, height, taps);
-    } else {
-      Filter2DVertical<2, /*is_compound=*/true>(
-          intermediate_result, dest, dest_stride, width, height, taps);
-    }
+    Compound2DVertical<2>(intermediate_result, width, height, taps, prediction);
   }
 }
 
-inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+inline void HalfAddHorizontal(const uint8_t* const src, uint8_t* const dst) {
   const uint8x16_t left = vld1q_u8(src);
   const uint8x16_t right = vld1q_u8(src + 1);
   vst1q_u8(dst, vrhaddq_u8(left, right));
@@ -2554,7 +2565,7 @@ inline void IntraBlockCopyHorizontal(const uint8_t* src,
   const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
   const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
 
-  int y = 0;
+  int y = height;
   do {
     HalfAddHorizontal(src, dst);
     if (width >= 32) {
@@ -2586,7 +2597,7 @@ inline void IntraBlockCopyHorizontal(const uint8_t* src,
     }
     src += src_remainder_stride;
     dst += dst_remainder_stride;
-  } while (++y < height);
+  } while (--y != 0);
 }
 
 void ConvolveIntraBlockCopyHorizontal_NEON(
@@ -2610,7 +2621,7 @@ void ConvolveIntraBlockCopyHorizontal_NEON(
     IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
                                  pred_stride);
   } else if (width == 8) {
-    int y = 0;
+    int y = height;
     do {
       const uint8x8_t left = vld1_u8(src);
       const uint8x8_t right = vld1_u8(src + 1);
@@ -2618,11 +2629,11 @@ void ConvolveIntraBlockCopyHorizontal_NEON(
 
       src += reference_stride;
       dest += pred_stride;
-    } while (++y < height);
+    } while (--y != 0);
   } else if (width == 4) {
     uint8x8_t left = vdup_n_u8(0);
     uint8x8_t right = vdup_n_u8(0);
-    int y = 0;
+    int y = height;
     do {
       left = Load4<0>(src, left);
       right = Load4<0>(src + 1, right);
@@ -2637,13 +2648,13 @@ void ConvolveIntraBlockCopyHorizontal_NEON(
       dest += pred_stride;
       StoreHi4(dest, result);
       dest += pred_stride;
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
   } else {
     assert(width == 2);
     uint8x8_t left = vdup_n_u8(0);
     uint8x8_t right = vdup_n_u8(0);
-    int y = 0;
+    int y = height;
     do {
       left = Load2<0>(src, left);
       right = Load2<0>(src + 1, right);
@@ -2658,8 +2669,8 @@ void ConvolveIntraBlockCopyHorizontal_NEON(
       dest += pred_stride;
       Store2<1>(dest, result);
       dest += pred_stride;
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
   }
 }
 
@@ -2694,7 +2705,7 @@ inline void IntraBlockCopyVertical(const uint8_t* src,
   }
   src += src_remainder_stride;
 
-  int y = 0;
+  int y = height;
   do {
     below[0] = vld1q_u8(src);
     if (width >= 32) {
@@ -2749,7 +2760,7 @@ inline void IntraBlockCopyVertical(const uint8_t* src,
       }
     }
     dst += dst_remainder_stride;
-  } while (++y < height);
+  } while (--y != 0);
 }
 
 void ConvolveIntraBlockCopyVertical_NEON(
@@ -2778,7 +2789,7 @@ void ConvolveIntraBlockCopyVertical_NEON(
     row = vld1_u8(src);
     src += reference_stride;
 
-    int y = 0;
+    int y = height;
     do {
       below = vld1_u8(src);
       src += reference_stride;
@@ -2787,13 +2798,13 @@ void ConvolveIntraBlockCopyVertical_NEON(
       dest += pred_stride;
 
       row = below;
-    } while (++y < height);
+    } while (--y != 0);
   } else if (width == 4) {
     uint8x8_t row = Load4(src);
     uint8x8_t below = vdup_n_u8(0);
     src += reference_stride;
 
-    int y = 0;
+    int y = height;
     do {
       below = Load4<0>(src, below);
       src += reference_stride;
@@ -2802,14 +2813,14 @@ void ConvolveIntraBlockCopyVertical_NEON(
       dest += pred_stride;
 
       row = below;
-    } while (++y < height);
+    } while (--y != 0);
   } else {
     assert(width == 2);
     uint8x8_t row = Load2(src);
     uint8x8_t below = vdup_n_u8(0);
     src += reference_stride;
 
-    int y = 0;
+    int y = height;
     do {
       below = Load2<0>(src, below);
       src += reference_stride;
@@ -2818,7 +2829,7 @@ void ConvolveIntraBlockCopyVertical_NEON(
       dest += pred_stride;
 
       row = below;
-    } while (++y < height);
+    } while (--y != 0);
   }
 }
 
@@ -2870,7 +2881,7 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
   }
   src += src_remainder_stride;
 
-  int y = 0;
+  int y = height;
   do {
     const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
     vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2));
@@ -2981,7 +2992,7 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
     }
     src += src_remainder_stride;
     dst += dst_remainder_stride;
-  } while (++y < height);
+  } while (--y != 0);
 }
 
 void ConvolveIntraBlockCopy2D_NEON(
@@ -3013,7 +3024,7 @@ void ConvolveIntraBlockCopy2D_NEON(
 
     uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
 
-    int y = 0;
+    int y = height;
     do {
       left = Load4<0>(src, left);
       right = Load4<0>(src + 1, right);
@@ -3032,8 +3043,8 @@ void ConvolveIntraBlockCopy2D_NEON(
       dest += pred_stride;
 
       row = vget_high_u16(below);
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
   } else {
     uint8x8_t left = Load2(src);
     uint8x8_t right = Load2(src + 1);
@@ -3041,7 +3052,7 @@ void ConvolveIntraBlockCopy2D_NEON(
 
     uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
 
-    int y = 0;
+    int y = height;
     do {
       left = Load2<0>(src, left);
       right = Load2<0>(src + 1, right);
@@ -3060,8 +3071,8 @@ void ConvolveIntraBlockCopy2D_NEON(
       dest += pred_stride;
 
       row = vget_high_u16(below);
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
   }
 }
 
@@ -3093,7 +3104,7 @@ void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); }
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 
 namespace libgav1 {
 namespace dsp {