Diffstat (limited to 'src/dsp/x86/convolve_sse4.cc')
-rw-r--r--  src/dsp/x86/convolve_sse4.cc  2830
1 file changed, 2830 insertions, 0 deletions
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc
new file mode 100644
index 0000000..3a0fff5
--- /dev/null
+++ b/src/dsp/x86/convolve_sse4.cc
@@ -0,0 +1,2830 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/convolve.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from outranging int16_t.
+template <int filter_index>
+__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
+  __m128i sum;
+  if (filter_index < 2) {
+    // 6 taps.
+    const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]);  // k2k1
+    const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]);  // k4k3
+    const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]);  // k6k5
+    sum = _mm_add_epi16(v_madd_21, v_madd_43);
+    sum = _mm_add_epi16(sum, v_madd_65);
+  } else if (filter_index == 2) {
+    // 8 taps.
+    const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]);  // k1k0
+    const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]);  // k3k2
+    const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]);  // k5k4
+    const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]);  // k7k6
+    const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
+    const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
+    sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
+  } else if (filter_index == 3) {
+    // 2 taps.
+    sum = _mm_maddubs_epi16(src[0], taps[0]);  // k4k3
+  } else {
+    // 4 taps.
+    const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]);  // k3k2
+    const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]);  // k5k4
+    sum = _mm_add_epi16(v_madd_32, v_madd_54);
+  }
+  return sum;
+}
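The pre-shift matters because _mm_maddubs_epi16 saturates each 16-bit pair sum. A minimal scalar sketch of the range argument, not part of the patch, with illustrative tap values rather than the actual kSubPixelFilters entries:

#include <cstdio>

int main() {
  // A sharp 8-tap kernel shape; the taps sum to 128 (7-bit precision) and
  // the absolute values can sum to roughly 200. Values are illustrative.
  const int full_taps[8] = {-2, 6, -14, 98, 60, -26, 10, -4};
  int abs_sum = 0;
  for (int t : full_taps) abs_sum += (t < 0) ? -t : t;
  // Worst case against 8-bit sources: every positive tap sees 255 and every
  // negative tap sees 0 (or vice versa).
  std::printf("full taps:   max |sum| = %d\n", abs_sum * 255);        // 56100
  std::printf("halved taps: max |sum| = %d\n", (abs_sum / 2) * 255);  // 28050
  // 56100 overflows int16_t (32767); the halved kernel stays in range.
}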
+
+template <int filter_index>
+__m128i SumHorizontalTaps(const uint8_t* const src,
+                          const __m128i* const v_tap) {
+  __m128i v_src[4];
+  const __m128i src_long = LoadUnaligned16(src);
+  const __m128i src_long_dup_lo = _mm_unpacklo_epi8(src_long, src_long);
+  const __m128i src_long_dup_hi = _mm_unpackhi_epi8(src_long, src_long);
+
+  if (filter_index < 2) {
+    // 6 taps.
+    v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3);   // _21
+    v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);   // _43
+    v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11);  // _65
+  } else if (filter_index == 2) {
+    // 8 taps.
+    v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1);   // _10
+    v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);   // _32
+    v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);   // _54
+    v_src[3] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13);  // _76
+  } else if (filter_index == 3) {
+    // 2 taps.
+    v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);  // _43
+  } else if (filter_index > 3) {
+    // 4 taps.
+    v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);  // _32
+    v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);  // _54
+  }
+  const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+  return sum;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps(const uint8_t* const src,
+                             const __m128i* const v_tap) {
+  __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+  // Normally the Horizontal pass does the downshift in two passes:
+  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+  // requires adding the rounding offset from the skipped shift.
+  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+  sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+  sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+  return _mm_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i HorizontalTaps8To16(const uint8_t* const src,
+                            const __m128i* const v_tap) {
+  const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
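The combined downshift is exact, not an approximation. A standalone check, not part of the patch, assuming the 8-bit-path constants kInterRoundBitsHorizontal == 3 and kFilterBits == 7 and an arithmetic right shift for negative values:

#include <cassert>
#include <cstdio>

// Rounding right shift, assuming arithmetic shift for negative values.
int RoundShift(int x, int bits) { return (x + (1 << (bits - 1))) >> bits; }

int main() {
  const int kInterRoundBitsHorizontal = 3;  // 8-bit path constant
  const int kFilterBits = 7;
  for (int x = -4096; x <= 4096; ++x) {
    // Two-pass: both shifts are reduced by 1 because the taps are halved.
    const int two_pass =
        RoundShift(RoundShift(x, kInterRoundBitsHorizontal - 1),
                   kFilterBits - kInterRoundBitsHorizontal);
    // One-pass: add the skipped rounding offset, then shift by the total.
    const int one_pass = RoundShift(x + (1 << (kInterRoundBitsHorizontal - 2)),
                                    kFilterBits - 1);
    assert(two_pass == one_pass);
  }
  std::puts("combined shift matches two-pass rounding");
}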
+
+template <int filter_index>
+__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                             const __m128i* const v_tap) {
+  const __m128i input0 = LoadLo8(&src[2]);
+  const __m128i input1 = LoadLo8(&src[2 + src_stride]);
+
+  if (filter_index == 3) {
+    // 03 04 04 05 05 06 06 07 ....
+    const __m128i input0_dup =
+        _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 3);
+    // 13 14 14 15 15 16 16 17 ....
+    const __m128i input1_dup =
+        _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 3);
+    const __m128i v_src_43 = _mm_unpacklo_epi64(input0_dup, input1_dup);
+    const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]);  // k4k3
+    return v_sum_43;
+  }
+
+  // 02 03 03 04 04 05 05 06 06 07 ....
+  const __m128i input0_dup =
+      _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 1);
+  // 12 13 13 14 14 15 15 16 16 17 ....
+  const __m128i input1_dup =
+      _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 1);
+  // 04 05 05 06 06 07 07 08 ...
+  const __m128i input0_dup_54 = _mm_srli_si128(input0_dup, 4);
+  // 14 15 15 16 16 17 17 18 ...
+  const __m128i input1_dup_54 = _mm_srli_si128(input1_dup, 4);
+  const __m128i v_src_32 = _mm_unpacklo_epi64(input0_dup, input1_dup);
+  const __m128i v_src_54 = _mm_unpacklo_epi64(input0_dup_54, input1_dup_54);
+  const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]);  // k3k2
+  const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]);  // k5k4
+  const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
+  return v_sum_5432;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                                const __m128i* const v_tap) {
+  __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+  // Normally the Horizontal pass does the downshift in two passes:
+  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+  // requires adding the rounding offset from the skipped shift.
+  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+  sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+  sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+  return _mm_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                                const __m128i* const v_tap) {
+  const __m128i sum =
+      SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int num_taps, int step, int filter_index, bool is_2d = false,
+          bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+                      void* const dest, const ptrdiff_t pred_stride,
+                      const int width, const int height,
+                      const __m128i* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+
+  // 4 tap filters are never used when width > 4.
+  if (num_taps != 4 && width > 4) {
+    int y = 0;
+    do {
+      int x = 0;
+      do {
+        if (is_2d || is_compound) {
+          const __m128i v_sum =
+              HorizontalTaps8To16<filter_index>(&src[x], v_tap);
+          if (is_2d) {
+            StoreAligned16(&dest16[x], v_sum);
+          } else {
+            StoreUnaligned16(&dest16[x], v_sum);
+          }
+        } else {
+          const __m128i result =
+              SimpleHorizontalTaps<filter_index>(&src[x], v_tap);
+          StoreLo8(&dest8[x], result);
+        }
+        x += step;
+      } while (x < width);
+      src += src_stride;
+      dest8 += pred_stride;
+      dest16 += pred_stride;
+    } while (++y < height);
+    return;
+  }
+
+  // The horizontal pass only needs to account for |num_taps| 2 and 4 when
+  // |width| <= 4.
+  assert(width <= 4);
+  assert(num_taps <= 4);
+  if (num_taps <= 4) {
+    if (width == 4) {
+      int y = 0;
+      do {
+        if (is_2d || is_compound) {
+          const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap);
+          StoreLo8(dest16, v_sum);
+        } else {
+          const __m128i result =
+              SimpleHorizontalTaps<filter_index>(src, v_tap);
+          Store4(&dest8[0], result);
+        }
+        src += src_stride;
+        dest8 += pred_stride;
+        dest16 += pred_stride;
+      } while (++y < height);
+      return;
+    }
+
+    if (!is_compound) {
+      int y = 0;
+      do {
+        if (is_2d) {
+          const __m128i sum =
+              HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+          Store4(&dest16[0], sum);
+          dest16 += pred_stride;
+          Store4(&dest16[0], _mm_srli_si128(sum, 8));
+          dest16 += pred_stride;
+        } else {
+          const __m128i sum =
+              SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+          Store2(dest8, sum);
+          dest8 += pred_stride;
+          Store2(dest8, _mm_srli_si128(sum, 4));
+          dest8 += pred_stride;
+        }
+
+        src += src_stride << 1;
+        y += 2;
+      } while (y < height - 1);
+
+      // The 2d filters have an odd |height| because the horizontal pass
+      // generates context for the vertical pass.
+      if (is_2d) {
+        assert(height % 2 == 1);
+        __m128i sum;
+        const __m128i input = LoadLo8(&src[2]);
+        if (filter_index == 3) {
+          // 03 04 04 05 05 06 06 07 ....
+          const __m128i v_src_43 =
+              _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
+          sum = _mm_maddubs_epi16(v_src_43, v_tap[0]);  // k4k3
+        } else {
+          // 02 03 03 04 04 05 05 06 06 07 ....
+          const __m128i v_src_32 =
+              _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
+          // 04 05 05 06 06 07 07 08 ...
+          const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
+          const __m128i v_madd_32 =
+              _mm_maddubs_epi16(v_src_32, v_tap[0]);  // k3k2
+          const __m128i v_madd_54 =
+              _mm_maddubs_epi16(v_src_54, v_tap[1]);  // k5k4
+          sum = _mm_add_epi16(v_madd_54, v_madd_32);
+        }
+        sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+        Store4(dest16, sum);
+      }
+    }
+  }
+}
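A loop-accounting sketch, not part of the patch: with a 4-tap vertical filter and height == 4, the 2D horizontal pass emits height + 4 - 1 == 7 rows; the paired loop above covers six of them and the |is_2d| tail block filters the last one.

#include <cstdio>

int main() {
  const int height = 7;  // odd intermediate height for the 2D pass
  int rows = 0, iterations = 0;
  int y = 0;
  do {
    rows += 2;  // each iteration stores two output rows
    y += 2;
    ++iterations;
  } while (y < height - 1);
  std::printf("%d paired iterations, %d rows, 1 tail row\n", iterations, rows);
}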
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+                                     __m128i* v_tap) {
+  if (num_taps == 8) {
+    v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0);   // k1k0
+    v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55);  // k3k2
+    v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa);  // k5k4
+    v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff);  // k7k6
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+      v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+      v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+      v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+      v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
+    }
+  } else if (num_taps == 6) {
+    const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+    v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0);   // k2k1
+    v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55);  // k4k3
+    v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa);  // k6k5
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+      v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+      v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+    }
+  } else if (num_taps == 4) {
+    v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55);  // k3k2
+    v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa);  // k5k4
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+    }
+  } else {  // num_taps == 2
+    const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+    v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55);  // k4k3
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+    }
+  }
+}
+
+template <int num_taps, bool is_compound>
+__m128i SimpleSum2DVerticalTaps(const __m128i* const src,
+                                const __m128i* const taps) {
+  __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]);
+  __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]);
+  if (num_taps >= 4) {
+    __m128i madd_lo =
+        _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]);
+    __m128i madd_hi =
+        _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]);
+    sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+    sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+    if (num_taps >= 6) {
+      madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]);
+      madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]);
+      sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+      sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+      if (num_taps == 8) {
+        madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]);
+        madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]);
+        sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+        sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+      }
+    }
+  }
+
+  if (is_compound) {
+    return _mm_packs_epi32(
+        RightShiftWithRounding_S32(sum_lo,
+                                   kInterRoundBitsCompoundVertical - 1),
+        RightShiftWithRounding_S32(sum_hi,
+                                   kInterRoundBitsCompoundVertical - 1));
+  }
+
+  return _mm_packs_epi32(
+      RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+      RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
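A scalar model of the _mm_madd_epi16 accumulation above, not part of the patch: the 2D vertical pass interleaves two source rows so each 32-bit lane receives src[row0]*tap0 + src[row1]*tap1 from one instruction. Values here are illustrative.

#include <cstdint>
#include <cstdio>

int32_t MaddPair(int16_t a0, int16_t a1, int16_t b0, int16_t b1) {
  // One 32-bit lane of _mm_madd_epi16: two 16x16 products summed into 32 bits.
  return static_cast<int32_t>(a0) * b0 + static_cast<int32_t>(a1) * b1;
}

int main() {
  // Four taps processed as two pairs, matching taps[0] and taps[1].
  const int16_t rows[4] = {520, 810, 790, 500};  // intermediate 16-bit values
  const int16_t taps[4] = {-2, 34, 34, -2};      // illustrative half taps
  const int32_t sum = MaddPair(rows[0], rows[1], taps[0], taps[1]) +
                      MaddPair(rows[2], rows[3], taps[2], taps[3]);
  std::printf("32-bit accumulation: %d\n", sum);
}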
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical(const uint16_t* src, void* const dst,
+                      const ptrdiff_t dst_stride, const int width,
+                      const int height, const __m128i* const taps) {
+  assert(width >= 8);
+  constexpr int next_row = num_taps - 1;
+  // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+  const ptrdiff_t src_stride = width;
+
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  int x = 0;
+  do {
+    __m128i srcs[8];
+    const uint16_t* src_x = src + x;
+    srcs[0] = LoadAligned16(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = LoadAligned16(src_x);
+      src_x += src_stride;
+      srcs[2] = LoadAligned16(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = LoadAligned16(src_x);
+        src_x += src_stride;
+        srcs[4] = LoadAligned16(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = LoadAligned16(src_x);
+          src_x += src_stride;
+          srcs[6] = LoadAligned16(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    int y = 0;
+    do {
+      srcs[next_row] = LoadAligned16(src_x);
+      src_x += src_stride;
+
+      const __m128i sum =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+      if (is_compound) {
+        StoreUnaligned16(dst16 + x + y * dst_stride, sum);
+      } else {
+        StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(sum, sum));
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (++y < height);
+    x += 8;
+  } while (x < width);
+}
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical4xH(const uint16_t* src, void* const dst,
+                         const ptrdiff_t dst_stride, const int height,
+                         const __m128i* const taps) {
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  __m128i srcs[9];
+  srcs[0] = LoadAligned16(src);
+  src += 8;
+  if (num_taps >= 4) {
+    srcs[2] = LoadAligned16(src);
+    src += 8;
+    srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]);
+    if (num_taps >= 6) {
+      srcs[4] = LoadAligned16(src);
+      src += 8;
+      srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]);
+      if (num_taps == 8) {
+        srcs[6] = LoadAligned16(src);
+        src += 8;
+        srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]);
+      }
+    }
+  }
+
+  int y = 0;
+  do {
+    srcs[num_taps] = LoadAligned16(src);
+    src += 8;
+    srcs[num_taps - 1] = _mm_unpacklo_epi64(
+        _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]);
+
+    const __m128i sum =
+        SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+    if (is_compound) {
+      StoreUnaligned16(dst16, sum);
+      dst16 += 4 << 1;
+    } else {
+      const __m128i results = _mm_packus_epi16(sum, sum);
+      Store4(dst8, results);
+      dst8 += dst_stride;
+      Store4(dst8, _mm_srli_si128(results, 4));
+      dst8 += dst_stride;
+    }
+
+    srcs[0] = srcs[2];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
+        if (num_taps == 8) {
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
+        }
+      }
+    }
+    y += 2;
+  } while (y < height);
+}
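A sketch of the 4xH row pairing, not part of the patch: with stride == width == 4, each aligned 16-byte load holds rows (2k, 2k+1), and combining the high half of one load with the low half of the next yields the odd pair without an extra load.

#include <cstdint>
#include <cstdio>

int main() {
  // Four rows of the width-4 intermediate buffer; each value is its row index.
  uint16_t buffer[16];
  for (int i = 0; i < 16; ++i) buffer[i] = static_cast<uint16_t>(i / 4);
  // Emulate srcs[1] = unpacklo_epi64(srli_si128(srcs[0], 8), srcs[2]).
  uint16_t pair[8];
  for (int i = 0; i < 4; ++i) pair[i] = buffer[4 + i];      // row 1
  for (int i = 0; i < 4; ++i) pair[4 + i] = buffer[8 + i];  // row 2
  std::printf("register holds rows %d and %d\n", static_cast<int>(pair[0]),
              static_cast<int>(pair[4]));
}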
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVertical2xH(const uint16_t* src, void* const dst,
+                         const ptrdiff_t dst_stride, const int height,
+                         const __m128i* const taps) {
+  constexpr int next_row = (num_taps < 6) ? 4 : 8;
+
+  auto* dst8 = static_cast<uint8_t*>(dst);
+
+  __m128i srcs[9];
+  srcs[0] = LoadAligned16(src);
+  src += 8;
+  if (num_taps >= 6) {
+    srcs[4] = LoadAligned16(src);
+    src += 8;
+    srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+    if (num_taps == 8) {
+      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+    }
+  }
+
+  int y = 0;
+  do {
+    srcs[next_row] = LoadAligned16(src);
+    src += 8;
+    if (num_taps == 2) {
+      srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+    } else if (num_taps == 4) {
+      srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+    } else if (num_taps == 6) {
+      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+      srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+    } else if (num_taps == 8) {
+      srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+      srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8);
+      srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12);
+    }
+
+    const __m128i sum =
+        SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+    const __m128i results = _mm_packus_epi16(sum, sum);
+
+    Store2(dst8, results);
+    dst8 += dst_stride;
+    Store2(dst8, _mm_srli_si128(results, 2));
+    // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
+    // Therefore we don't need to check this condition when |height| > 4.
+    if (num_taps <= 4 && height == 2) return;
+    dst8 += dst_stride;
+    Store2(dst8, _mm_srli_si128(results, 4));
+    dst8 += dst_stride;
+    Store2(dst8, _mm_srli_si128(results, 6));
+    dst8 += dst_stride;
+
+    srcs[0] = srcs[4];
+    if (num_taps == 6) {
+      srcs[1] = srcs[5];
+      srcs[4] = srcs[8];
+    } else if (num_taps == 8) {
+      srcs[1] = srcs[5];
+      srcs[2] = srcs[6];
+      srcs[3] = srcs[7];
+      srcs[4] = srcs[8];
+    }
+
+    y += 4;
+  } while (y < height);
+}
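DoHorizontalPass below dispatches on |filter_index| with the tap counts noted in its comments. A minimal sketch of that mapping, not part of the patch; the real helper, GetNumTapsInFilter(), lives in the shared convolve sources:

#include <cstdio>

// Tap counts per filter index, as assumed by the dispatch below.
int NumTaps(int filter_index) {
  switch (filter_index) {
    case 0:
    case 1:
      return 6;  // the two 6-tap variants
    case 2:
      return 8;  // sharp
    case 3:
      return 2;  // bilinear
    default:
      return 4;  // filter_index 4 and 5
  }
}

int main() {
  for (int i = 0; i < 6; ++i) std::printf("index %d: %d taps\n", i, NumTaps(i));
}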
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+    const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+    const ptrdiff_t dst_stride, const int width, const int height,
+    const int filter_id, const int filter_index) {
+  assert(filter_id != 0);
+  __m128i v_tap[4];
+  const __m128i v_horizontal_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+  if (filter_index == 2) {  // 8 tap.
+    SetupTaps<8>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<8, 8, 2, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 1) {  // 6 tap.
+    SetupTaps<6>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<6, 8, 1, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 0) {  // 6 tap.
+    SetupTaps<6>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<6, 8, 0, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 4) {  // 4 tap.
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, 8, 4, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 5) {  // 4 tap.
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, 8, 5, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else {  // 2 tap.
+    SetupTaps<2>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<2, 8, 3, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  }
+}
+
+void Convolve2D_SSE4_1(const void* const reference,
+                       const ptrdiff_t reference_stride,
+                       const int horizontal_filter_index,
+                       const int vertical_filter_index,
+                       const int horizontal_filter_id,
+                       const int vertical_filter_id, const int width,
+                       const int height, void* prediction,
+                       const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+  // The output of the horizontal filter is guaranteed to fit in 16 bits.
+  alignas(16) uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+  const int intermediate_height = height + vertical_taps - 1;
+
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+
+  DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
+                                   width, intermediate_height,
+                                   horizontal_filter_id, horiz_filter_index);
+
+  // Vertical filter.
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  __m128i taps[4];
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+  if (vertical_taps == 8) {
+    SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 2) {
+      Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<8>(intermediate_result, dest, dest_stride, width,
+                          height, taps);
+    }
+  } else if (vertical_taps == 6) {
+    SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 2) {
+      Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<6>(intermediate_result, dest, dest_stride, width,
+                          height, taps);
+    }
+  } else if (vertical_taps == 4) {
+    SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 2) {
+      Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<4>(intermediate_result, dest, dest_stride, width,
+                          height, taps);
+    }
+  } else {  // |vertical_taps| == 2
+    SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 2) {
+      Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<2>(intermediate_result, dest, dest_stride, width,
+                          height, taps);
+    }
+  }
+}
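Worked offsets for the 2D path, not part of the patch: an 8-tap vertical filter needs 3 rows of context above the block and 4 below, so the source pointer backs up (8 / 2 - 1) == 3 rows and the intermediate buffer holds height + 7 rows.

#include <cstdio>

int main() {
  const int height = 16;
  const int vertical_taps = 8;
  const int rows_above = vertical_taps / 2 - 1;                // 3
  const int intermediate_height = height + vertical_taps - 1;  // 23
  std::printf("back up %d rows, filter %d intermediate rows\n", rows_above,
              intermediate_height);
}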
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
+__m128i Compound1DShift(const __m128i sum) {
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int filter_index>
+__m128i SumVerticalTaps(const __m128i* const srcs,
+                        const __m128i* const v_tap) {
+  __m128i v_src[4];
+
+  if (filter_index < 2) {
+    // 6 taps.
+    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+    v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+  } else if (filter_index == 2) {
+    // 8 taps.
+    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+    v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+    v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
+  } else if (filter_index == 3) {
+    // 2 taps.
+    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+  } else if (filter_index > 3) {
+    // 4 taps.
+    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+  }
+  const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+  return sum;
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
+                    void* const dst, const ptrdiff_t dst_stride,
+                    const int width, const int height,
+                    const __m128i* const v_tap) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps - 1;
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+  assert(width >= 8);
+
+  int x = 0;
+  do {
+    const uint8_t* src_x = src + x;
+    __m128i srcs[8];
+    srcs[0] = LoadLo8(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = LoadLo8(src_x);
+      src_x += src_stride;
+      srcs[2] = LoadLo8(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = LoadLo8(src_x);
+        src_x += src_stride;
+        srcs[4] = LoadLo8(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = LoadLo8(src_x);
+          src_x += src_stride;
+          srcs[6] = LoadLo8(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    int y = 0;
+    do {
+      srcs[next_row] = LoadLo8(src_x);
+      src_x += src_stride;
+
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      if (is_compound) {
+        const __m128i results = Compound1DShift(sums);
+        StoreUnaligned16(dst16 + x + y * dst_stride, results);
+      } else {
+        const __m128i results =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        StoreLo8(dst8 + x + y * dst_stride,
+                 _mm_packus_epi16(results, results));
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (++y < height);
+    x += 8;
+  } while (x < width);
+}
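A scalar model of the sliding register window used by the vertical filters, not part of the patch: after each output row the oldest row drops out and the remaining rows shift down one slot, so only one new load per row is needed.

#include <cstdio>

int main() {
  const int num_taps = 8;
  int srcs[8] = {0, 1, 2, 3, 4, 5, 6, -1};  // row indices; slot 7 is next_row
  for (int y = 0; y < 3; ++y) {
    srcs[num_taps - 1] = num_taps - 1 + y;  // load only the newest row
    std::printf("filter rows %d..%d\n", srcs[0], srcs[num_taps - 1]);
    for (int i = 0; i + 1 < num_taps; ++i) srcs[i] = srcs[i + 1];  // slide
  }
}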
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
+                       void* const dst, const ptrdiff_t dst_stride,
+                       const int height, const __m128i* const v_tap) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  __m128i srcs[9];
+
+  if (num_taps == 2) {
+    srcs[2] = _mm_setzero_si128();
+    // 00 01 02 03
+    srcs[0] = Load4(src);
+    src += src_stride;
+
+    int y = 0;
+    do {
+      // 10 11 12 13
+      const __m128i a = Load4(src);
+      // 00 01 02 03 10 11 12 13
+      srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+      src += src_stride;
+      // 20 21 22 23
+      srcs[2] = Load4(src);
+      src += src_stride;
+      // 10 11 12 13 20 21 22 23
+      srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      if (is_compound) {
+        const __m128i results = Compound1DShift(sums);
+        StoreUnaligned16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const __m128i results_16 =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        const __m128i results = _mm_packus_epi16(results_16, results_16);
+        Store4(dst8, results);
+        dst8 += dst_stride;
+        Store4(dst8, _mm_srli_si128(results, 4));
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      y += 2;
+    } while (y < height);
+  } else if (num_taps == 4) {
+    srcs[4] = _mm_setzero_si128();
+    // 00 01 02 03
+    srcs[0] = Load4(src);
+    src += src_stride;
+    // 10 11 12 13
+    const __m128i a = Load4(src);
+    // 00 01 02 03 10 11 12 13
+    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+    src += src_stride;
+    // 20 21 22 23
+    srcs[2] = Load4(src);
+    src += src_stride;
+    // 10 11 12 13 20 21 22 23
+    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+    int y = 0;
+    do {
+      // 30 31 32 33
+      const __m128i b = Load4(src);
+      // 20 21 22 23 30 31 32 33
+      srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+      src += src_stride;
+      // 40 41 42 43
+      srcs[4] = Load4(src);
+      src += src_stride;
+      // 30 31 32 33 40 41 42 43
+      srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      if (is_compound) {
+        const __m128i results = Compound1DShift(sums);
+        StoreUnaligned16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const __m128i results_16 =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        const __m128i results = _mm_packus_epi16(results_16, results_16);
+        Store4(dst8, results);
+        dst8 += dst_stride;
+        Store4(dst8, _mm_srli_si128(results, 4));
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      y += 2;
+    } while (y < height);
+  } else if (num_taps == 6) {
+    srcs[6] = _mm_setzero_si128();
+    // 00 01 02 03
+    srcs[0] = Load4(src);
+    src += src_stride;
+    // 10 11 12 13
+    const __m128i a = Load4(src);
+    // 00 01 02 03 10 11 12 13
+    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+    src += src_stride;
+    // 20 21 22 23
+    srcs[2] = Load4(src);
+    src += src_stride;
+    // 10 11 12 13 20 21 22 23
+    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+    // 30 31 32 33
+    const __m128i b = Load4(src);
+    // 20 21 22 23 30 31 32 33
+    srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+    src += src_stride;
+    // 40 41 42 43
+    srcs[4] = Load4(src);
+    src += src_stride;
+    // 30 31 32 33 40 41 42 43
+    srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+    int y = 0;
+    do {
+      // 50 51 52 53
+      const __m128i c = Load4(src);
+      // 40 41 42 43 50 51 52 53
+      srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+      src += src_stride;
+      // 60 61 62 63
+      srcs[6] = Load4(src);
+      src += src_stride;
+      // 50 51 52 53 60 61 62 63
+      srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      if (is_compound) {
+        const __m128i results = Compound1DShift(sums);
+        StoreUnaligned16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const __m128i results_16 =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        const __m128i results = _mm_packus_epi16(results_16, results_16);
+        Store4(dst8, results);
+        dst8 += dst_stride;
+        Store4(dst8, _mm_srli_si128(results, 4));
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      srcs[3] = srcs[5];
+      srcs[4] = srcs[6];
+      y += 2;
+    } while (y < height);
+  } else if (num_taps == 8) {
+    srcs[8] = _mm_setzero_si128();
+    // 00 01 02 03
+    srcs[0] = Load4(src);
+    src += src_stride;
+    // 10 11 12 13
+    const __m128i a = Load4(src);
+    // 00 01 02 03 10 11 12 13
+    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+    src += src_stride;
+    // 20 21 22 23
+    srcs[2] = Load4(src);
+    src += src_stride;
+    // 10 11 12 13 20 21 22 23
+    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+    // 30 31 32 33
+    const __m128i b = Load4(src);
+    // 20 21 22 23 30 31 32 33
+    srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+    src += src_stride;
+    // 40 41 42 43
+    srcs[4] = Load4(src);
+    src += src_stride;
+    // 30 31 32 33 40 41 42 43
+    srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+    // 50 51 52 53
+    const __m128i c = Load4(src);
+    // 40 41 42 43 50 51 52 53
+    srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+    src += src_stride;
+    // 60 61 62 63
+    srcs[6] = Load4(src);
+    src += src_stride;
+    // 50 51 52 53 60 61 62 63
+    srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+    int y = 0;
+    do {
+      // 70 71 72 73
+      const __m128i d = Load4(src);
+      // 60 61 62 63 70 71 72 73
+      srcs[6] = _mm_unpacklo_epi32(srcs[6], d);
+      src += src_stride;
+      // 80 81 82 83
+      srcs[8] = Load4(src);
+      src += src_stride;
+      // 70 71 72 73 80 81 82 83
+      srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
+
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      if (is_compound) {
+        const __m128i results = Compound1DShift(sums);
+        StoreUnaligned16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const __m128i results_16 =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        const __m128i results = _mm_packus_epi16(results_16, results_16);
+        Store4(dst8, results);
+        dst8 += dst_stride;
+        Store4(dst8, _mm_srli_si128(results, 4));
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      srcs[3] = srcs[5];
+      srcs[4] = srcs[6];
+      srcs[5] = srcs[7];
+      srcs[6] = srcs[8];
+      y += 2;
+    } while (y < height);
+  }
+}
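A byte-layout sketch for the 2xH loaders below, not part of the patch: Load2<lane> inserts one 16-bit pair per lane, so four rows of a 2-wide block occupy a single 8-byte span as 00 01 | 10 11 | 20 21 | 30 31.

#include <cstdint>
#include <cstdio>

int main() {
  uint8_t reg[8] = {};
  const uint8_t rows[4][2] = {{0x00, 0x01}, {0x10, 0x11},
                              {0x20, 0x21}, {0x30, 0x31}};
  for (int lane = 0; lane < 4; ++lane) {
    reg[2 * lane + 0] = rows[lane][0];  // emulate Load2<lane>(src, reg)
    reg[2 * lane + 1] = rows[lane][1];
  }
  for (uint8_t b : reg) std::printf("%02x ", static_cast<unsigned>(b));
  std::printf("\n");
}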
+
+template <int filter_index, bool negative_outside_taps = false>
+void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
+                       void* const dst, const ptrdiff_t dst_stride,
+                       const int height, const __m128i* const v_tap) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  auto* dst8 = static_cast<uint8_t*>(dst);
+
+  __m128i srcs[9];
+
+  if (num_taps == 2) {
+    srcs[2] = _mm_setzero_si128();
+    // 00 01
+    srcs[0] = Load2(src);
+    src += src_stride;
+
+    int y = 0;
+    do {
+      // 00 01 10 11
+      srcs[0] = Load2<1>(src, srcs[0]);
+      src += src_stride;
+      // 00 01 10 11 20 21
+      srcs[0] = Load2<2>(src, srcs[0]);
+      src += src_stride;
+      // 00 01 10 11 20 21 30 31
+      srcs[0] = Load2<3>(src, srcs[0]);
+      src += src_stride;
+      // 40 41
+      srcs[2] = Load2<0>(src, srcs[2]);
+      src += src_stride;
+      // 00 01 10 11 20 21 30 31 40 41
+      const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]);
+      // 10 11 20 21 30 31 40 41
+      srcs[1] = _mm_srli_si128(srcs_0_2, 2);
+      // This uses srcs[0]..srcs[1].
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m128i results_16 =
+          RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+      Store2(dst8, results);
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 2));
+      if (height == 2) return;
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 4));
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 6));
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[2];
+      y += 4;
+    } while (y < height);
+  } else if (num_taps == 4) {
+    srcs[4] = _mm_setzero_si128();
+
+    // 00 01
+    srcs[0] = Load2(src);
+    src += src_stride;
+    // 00 01 10 11
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    // 00 01 10 11 20 21
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+
+    int y = 0;
+    do {
+      // 00 01 10 11 20 21 30 31
+      srcs[0] = Load2<3>(src, srcs[0]);
+      src += src_stride;
+      // 40 41
+      srcs[4] = Load2<0>(src, srcs[4]);
+      src += src_stride;
+      // 40 41 50 51
+      srcs[4] = Load2<1>(src, srcs[4]);
+      src += src_stride;
+      // 40 41 50 51 60 61
+      srcs[4] = Load2<2>(src, srcs[4]);
+      src += src_stride;
+      // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+      const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+      // 10 11 20 21 30 31 40 41
+      srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+      // 20 21 30 31 40 41 50 51
+      srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+      // 30 31 40 41 50 51 60 61
+      srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+      // This uses srcs[0]..srcs[3].
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m128i results_16 =
+          RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+      Store2(dst8, results);
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 2));
+      if (height == 2) return;
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 4));
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 6));
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      y += 4;
+    } while (y < height);
+  } else if (num_taps == 6) {
+    // During the vertical pass the number of taps is restricted when
+    // |height| <= 4.
+    assert(height > 4);
+    srcs[8] = _mm_setzero_si128();
+
+    // 00 01
+    srcs[0] = Load2(src);
+    src += src_stride;
+    // 00 01 10 11
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    // 00 01 10 11 20 21
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+    // 00 01 10 11 20 21 30 31
+    srcs[0] = Load2<3>(src, srcs[0]);
+    src += src_stride;
+    // 40 41
+    srcs[4] = Load2(src);
+    src += src_stride;
+    // 00 01 10 11 20 21 30 31 40 41
+    const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+    // 10 11 20 21 30 31 40 41
+    srcs[1] = _mm_srli_si128(srcs_0_4x, 2);
+
+    int y = 0;
+    do {
+      // 40 41 50 51
+      srcs[4] = Load2<1>(src, srcs[4]);
+      src += src_stride;
+      // 40 41 50 51 60 61
+      srcs[4] = Load2<2>(src, srcs[4]);
+      src += src_stride;
+      // 40 41 50 51 60 61 70 71
+      srcs[4] = Load2<3>(src, srcs[4]);
+      src += src_stride;
+      // 80 81
+      srcs[8] = Load2<0>(src, srcs[8]);
+      src += src_stride;
+      // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+      const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+      // 20 21 30 31 40 41 50 51
+      srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+      // 30 31 40 41 50 51 60 61
+      srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+      const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+      // 50 51 60 61 70 71 80 81
+      srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+
+      // This uses srcs[0]..srcs[5].
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m128i results_16 =
+          RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+      Store2(dst8, results);
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 2));
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 4));
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 6));
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      srcs[1] = srcs[5];
+      srcs[4] = srcs[8];
+      y += 4;
+    } while (y < height);
+  } else if (num_taps == 8) {
+    // During the vertical pass the number of taps is restricted when
+    // |height| <= 4.
+    assert(height > 4);
+    srcs[8] = _mm_setzero_si128();
+    // 00 01
+    srcs[0] = Load2(src);
+    src += src_stride;
+    // 00 01 10 11
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    // 00 01 10 11 20 21
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+    // 00 01 10 11 20 21 30 31
+    srcs[0] = Load2<3>(src, srcs[0]);
+    src += src_stride;
+    // 40 41
+    srcs[4] = Load2(src);
+    src += src_stride;
+    // 40 41 50 51
+    srcs[4] = Load2<1>(src, srcs[4]);
+    src += src_stride;
+    // 40 41 50 51 60 61
+    srcs[4] = Load2<2>(src, srcs[4]);
+    src += src_stride;
+
+    // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+    const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+    // 10 11 20 21 30 31 40 41
+    srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+    // 20 21 30 31 40 41 50 51
+    srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+    // 30 31 40 41 50 51 60 61
+    srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+    int y = 0;
+    do {
+      // 40 41 50 51 60 61 70 71
+      srcs[4] = Load2<3>(src, srcs[4]);
+      src += src_stride;
+      // 80 81
+      srcs[8] = Load2<0>(src, srcs[8]);
+      src += src_stride;
+      // 80 81 90 91
+      srcs[8] = Load2<1>(src, srcs[8]);
+      src += src_stride;
+      // 80 81 90 91 a0 a1
+      srcs[8] = Load2<2>(src, srcs[8]);
+      src += src_stride;
+
+      // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1
+      const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+      // 50 51 60 61 70 71 80 81
+      srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+      // 60 61 70 71 80 81 90 91
+      srcs[6] = _mm_srli_si128(srcs_4_8, 4);
+      // 70 71 80 81 90 91 a0 a1
+      srcs[7] = _mm_srli_si128(srcs_4_8, 6);
+
+      // This uses srcs[0]..srcs[7].
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m128i results_16 =
+          RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+      Store2(dst8, results);
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 2));
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 4));
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 6));
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      srcs[1] = srcs[5];
+      srcs[2] = srcs[6];
+      srcs[3] = srcs[7];
+      srcs[4] = srcs[8];
+      y += 4;
+    } while (y < height);
+  }
+}
+
+void ConvolveVertical_SSE4_1(const void* const reference,
+                             const ptrdiff_t reference_stride,
+                             const int /*horizontal_filter_index*/,
+                             const int vertical_filter_index,
+                             const int /*horizontal_filter_id*/,
+                             const int vertical_filter_id, const int width,
+                             const int height, void* prediction,
+                             const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  __m128i taps[4];
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+  if (filter_index < 2) {  // 6 tap.
+    SetupTaps<6>(&v_filter, taps);
+    if (width == 2) {
+      FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  } else if (filter_index == 2) {  // 8 tap.
+    SetupTaps<8>(&v_filter, taps);
+    if (width == 2) {
+      FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  } else if (filter_index == 3) {  // 2 tap.
+    SetupTaps<2>(&v_filter, taps);
+    if (width == 2) {
+      FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  } else if (filter_index == 4) {  // 4 tap.
+    SetupTaps<4>(&v_filter, taps);
+    if (width == 2) {
+      FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  } else {
+    // TODO(slavarnway): Investigate adding |filter_index| == 1 special cases.
+    // See convolve_neon.cc
+    SetupTaps<4>(&v_filter, taps);
+
+    if (width == 2) {
+      FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  }
+}
+
+void ConvolveCompoundCopy_SSE4(const void* const reference,
+                               const ptrdiff_t reference_stride,
+                               const int /*horizontal_filter_index*/,
+                               const int /*vertical_filter_index*/,
+                               const int /*horizontal_filter_id*/,
+                               const int /*vertical_filter_id*/,
+                               const int width, const int height,
+                               void* prediction, const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  const ptrdiff_t src_stride = reference_stride;
+  auto* dest = static_cast<uint16_t*>(prediction);
+  constexpr int kRoundBitsVertical =
+      kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
+  if (width >= 16) {
+    int y = height;
+    do {
+      int x = 0;
+      do {
+        const __m128i v_src = LoadUnaligned16(&src[x]);
+        const __m128i v_src_ext_lo = _mm_cvtepu8_epi16(v_src);
+        const __m128i v_src_ext_hi =
+            _mm_cvtepu8_epi16(_mm_srli_si128(v_src, 8));
+        const __m128i v_dest_lo =
+            _mm_slli_epi16(v_src_ext_lo, kRoundBitsVertical);
+        const __m128i v_dest_hi =
+            _mm_slli_epi16(v_src_ext_hi, kRoundBitsVertical);
+        // TODO(slavarnway): Investigate using aligned stores.
+        StoreUnaligned16(&dest[x], v_dest_lo);
+        StoreUnaligned16(&dest[x + 8], v_dest_hi);
+        x += 16;
+      } while (x < width);
+      src += src_stride;
+      dest += pred_stride;
+    } while (--y != 0);
+  } else if (width == 8) {
+    int y = height;
+    do {
+      const __m128i v_src = LoadLo8(&src[0]);
+      const __m128i v_src_ext = _mm_cvtepu8_epi16(v_src);
+      const __m128i v_dest = _mm_slli_epi16(v_src_ext, kRoundBitsVertical);
+      StoreUnaligned16(&dest[0], v_dest);
+      src += src_stride;
+      dest += pred_stride;
+    } while (--y != 0);
+  } else { /* width == 4 */
+    int y = height;
+    do {
+      const __m128i v_src0 = Load4(&src[0]);
+      const __m128i v_src1 = Load4(&src[src_stride]);
+      const __m128i v_src = _mm_unpacklo_epi32(v_src0, v_src1);
+      const __m128i v_src_ext = _mm_cvtepu8_epi16(v_src);
+      const __m128i v_dest = _mm_slli_epi16(v_src_ext, kRoundBitsVertical);
+      StoreLo8(&dest[0], v_dest);
+      StoreHi8(&dest[pred_stride], v_dest);
+      src += src_stride * 2;
+      dest += pred_stride * 2;
+      y -= 2;
+    } while (y != 0);
+  }
+}
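A worked example for the compound-copy scaling, not part of the patch: assuming the libgav1 constants kInterRoundBitsVertical == 11 and kInterRoundBitsCompoundVertical == 7, the copy path shifts each 8-bit sample left by 4 so it lands in the same 16-bit range the compound vertical filters produce.

#include <cstdint>
#include <cstdio>

int main() {
  const int kRoundBitsVertical = 11 - 7;  // 4
  const uint8_t sample = 255;
  const uint16_t compound = static_cast<uint16_t>(sample)
                            << kRoundBitsVertical;
  // 4080: the maximum compound value, comfortably inside uint16_t.
  std::printf("max compound value: %d\n", static_cast<int>(compound));
}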
+
+void ConvolveCompoundVertical_SSE4_1(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int /*horizontal_filter_index*/, const int vertical_filter_index,
+    const int /*horizontal_filter_id*/, const int vertical_filter_id,
+    const int width, const int height, void* prediction,
+    const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* dest = static_cast<uint16_t*>(prediction);
+  assert(vertical_filter_id != 0);
+
+  __m128i taps[4];
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+  if (filter_index < 2) {  // 6 tap.
+    SetupTaps<6>(&v_filter, taps);
+    if (width == 4) {
+      FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
+    } else {
+      FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps);
+    }
+  } else if (filter_index == 2) {  // 8 tap.
+    SetupTaps<8>(&v_filter, taps);
+
+    if (width == 4) {
+      FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
+    } else {
+      FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps);
+    }
+  } else if (filter_index == 3) {  // 2 tap.
+    SetupTaps<2>(&v_filter, taps);
+
+    if (width == 4) {
+      FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
+    } else {
+      FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps);
+    }
+  } else if (filter_index == 4) {  // 4 tap.
+    SetupTaps<4>(&v_filter, taps);
+
+    if (width == 4) {
+      FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
+    } else {
+      FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps);
+    }
+  } else {
+    SetupTaps<4>(&v_filter, taps);
+
+    if (width == 4) {
+      FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
+    } else {
+      FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps);
+    }
+  }
+}
+
+void ConvolveHorizontal_SSE4_1(const void* const reference,
+                               const ptrdiff_t reference_stride,
+                               const int horizontal_filter_index,
+                               const int /*vertical_filter_index*/,
+                               const int horizontal_filter_id,
+                               const int /*vertical_filter_id*/,
+                               const int width, const int height,
+                               void* prediction, const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  // Set |src| to the outermost tap.
+  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+                   horizontal_filter_id, filter_index);
+}
+
+void ConvolveCompoundHorizontal_SSE4_1(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int horizontal_filter_index, const int /*vertical_filter_index*/,
+    const int horizontal_filter_id, const int /*vertical_filter_id*/,
+    const int width, const int height, void* prediction,
+    const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* dest = static_cast<uint16_t*>(prediction);
+
+  DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+      src, reference_stride, dest, width, width, height, horizontal_filter_id,
+      filter_index);
+}
+
+void ConvolveCompound2D_SSE4_1(const void* const reference,
+                               const ptrdiff_t reference_stride,
+                               const int horizontal_filter_index,
+                               const int vertical_filter_index,
+                               const int horizontal_filter_id,
+                               const int vertical_filter_id, const int width,
+                               const int height, void* prediction,
+                               const ptrdiff_t /*pred_stride*/) {
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  alignas(16) uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [4, 5].
+  // Similarly for height.
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+  const int intermediate_height = height + vertical_taps - 1;
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* const src = static_cast<const uint8_t*>(reference) -
+                          (vertical_taps / 2 - 1) * src_stride -
+                          kHorizontalOffset;
+
+  DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+      src, src_stride, intermediate_result, width, width, intermediate_height,
+      horizontal_filter_id, horiz_filter_index);
+
+  // Vertical filter.
+  auto* dest = static_cast<uint16_t*>(prediction);
+  assert(vertical_filter_id != 0);
+
+  const ptrdiff_t dest_stride = width;
+  __m128i taps[4];
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+  if (vertical_taps == 8) {
+    SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 4) {
+      Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<8, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  } else if (vertical_taps == 6) {
+    SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 4) {
+      Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<6, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  } else if (vertical_taps == 4) {
+    SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 4) {
+      Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<4, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  } else {  // |vertical_taps| == 2
+    SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 4) {
+      Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<2, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  }
+}
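A layout sketch for the pre-transposed tables that follow, not part of the patch: the scaled convolve keys each output lane by its own filter_id, so the taps are stored column-major (column k holds tap k of all 16 sub-pixel filters) and one shuffle per tap pair gathers the coefficients for 8 lanes at once.

#include <cstdio>

int main() {
  // row-major: filters[filter_id][tap]; column-major: columns[tap][filter_id].
  // Two ids of a 2-tap filter, values illustrative.
  const int filters[2][2] = {{64, 0}, {60, 4}};
  int columns[2][2];
  for (int id = 0; id < 2; ++id) {
    for (int k = 0; k < 2; ++k) columns[k][id] = filters[id][k];
  }
  // A lane with filter_id == 1 reads tap k at columns[k][1].
  std::printf("taps for id 1: %d %d\n", columns[0][1], columns[1][1]);
}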
+
+// Pre-transposed filters.
+template <int filter_index>
+inline void GetHalfSubPixelFilter(__m128i* output) {
+  // Filter 0
+  alignas(
+      16) static constexpr int8_t kHalfSubPixel6TapSignedFilterColumns[6][16] =
+      {{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+       {0, -3, -5, -6, -7, -7, -8, -7, -7, -6, -6, -6, -5, -4, -2, -1},
+       {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+       {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+       {0, -1, -2, -4, -5, -6, -6, -6, -7, -7, -8, -7, -7, -6, -5, -3},
+       {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+  // Filter 1
+  alignas(16) static constexpr int8_t
+      kHalfSubPixel6TapMixedSignedFilterColumns[6][16] = {
+          {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+          {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+          {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+          {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+          {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14},
+          {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+  // Filter 2
+  alignas(
+      16) static constexpr int8_t kHalfSubPixel8TapSignedFilterColumns[8][16] =
+      {{0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, 0},
+       {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+       {0, -3, -6, -9, -11, -11, -12, -12, -12, -11, -10, -9, -7, -5, -3, -1},
+       {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+       {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+       {0, -1, -3, -5, -7, -9, -10, -11, -12, -12, -12, -11, -11, -9, -6, -3},
+       {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+       {0, 0, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1}};
+  // Filter 3
+  alignas(16) static constexpr uint8_t kHalfSubPixel2TapFilterColumns[2][16] =
+      {{64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+       {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
+  // Filter 4
+  alignas(
+      16) static constexpr int8_t kHalfSubPixel4TapSignedFilterColumns[4][16] =
+      {{0, -2, -4, -5, -6, -6, -7, -6, -6, -5, -5, -5, -4, -3, -2, -1},
+       {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+       {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+       {0, -1, -2, -3, -4, -5, -5, -5, -6, -6, -7, -6, -6, -5, -4, -2}};
+  // Filter 5
+  alignas(
+      16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] =
+      {{0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
+       {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+       {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+       {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
+  switch (filter_index) {
+    case 0:
+      output[0] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[0]);
+      output[1] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[1]);
+      output[2] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[2]);
+      output[3] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[3]);
+      output[4] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[4]);
+      output[5] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[5]);
+      break;
+    case 1:
+      // The term "mixed" refers to the fact that the outer taps have a mix of
+      // negative and positive values.
+      output[0] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[0]);
+      output[1] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[1]);
+      output[2] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[2]);
+      output[3] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[3]);
+      output[4] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[4]);
+      output[5] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[5]);
+      break;
+    case 2:
+      output[0] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[0]);
+      output[1] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[1]);
+      output[2] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[2]);
+      output[3] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[3]);
+      output[4] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[4]);
+      output[5] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[5]);
+      output[6] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[6]);
+      output[7] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[7]);
+      break;
+    case 3:
+      output[0] = LoadAligned16(kHalfSubPixel2TapFilterColumns[0]);
+      output[1] = LoadAligned16(kHalfSubPixel2TapFilterColumns[1]);
+      break;
+    case 4:
+      output[0] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[0]);
+      output[1] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[1]);
+      output[2] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[2]);
+      output[3] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[3]);
+      break;
+    default:
+      assert(filter_index == 5);
+      output[0] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[0]);
+      output[1] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[1]);
+      output[2] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[2]);
+      output[3] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[3]);
+      break;
+  }
+}
+
+// There are many opportunities for overreading in scaled convolve, because
+// the range of starting points for filter windows is anywhere from 0 to 16
+// for 8 destination pixels, and the window sizes range from 2 to 8. To
+// accommodate this range concisely, we use |grade_x| to mean the most steps
+// in src that can be traversed in a single |step_x| increment, i.e. 1 or 2.
+// More importantly, |grade_x| answers the question "how many vector loads
+// are needed to cover the source values?"
+// When |grade_x| == 1, the maximum number of source values needed is 8
+// separate starting positions plus 7 more to cover taps, all fitting into 16
+// bytes.
+// When |grade_x| > 1, we are guaranteed to exceed 8 whole steps in src for
+// every 8 |step_x| increments, on top of 8 possible taps. The first load
+// covers the starting sources for each kernel, while the final load covers
+// the taps.
+// Since the offset value of src_x cannot exceed 8 and |num_taps| does not
+// exceed 4 when width <= 4, |grade_x| is set to 1 regardless of the value of
+// |step_x|.
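A worked bound for |grade_x|, not part of the patch, assuming kScaleSubPixelBits == 10: eight destination pixels advance the source position by 8 * step_x / 1024 whole pixels, so when step_x <= 1024 one 16-byte load covers all 8 starting positions plus the tap offsets, and larger step_x needs a second load.

#include <cstdio>
#include <initializer_list>

int main() {
  const int kScaleSubPixelBits = 10;
  for (int step_x : {1024, 2048}) {
    const int last_start = (7 * step_x) >> kScaleSubPixelBits;  // lane 7
    const int span = last_start + 8;  // plus up to 8 bytes of taps
    std::printf("step_x=%d: source span ~%d bytes -> grade_x=%d\n", step_x,
                span, (span > 16) ? 2 : 1);
  }
}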
+template <int num_taps, int grade_x>
+inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices,
+                                 __m128i* const source /*[num_taps >> 1]*/) {
+  const __m128i src_vals = LoadUnaligned16(src);
+  source[0] = _mm_shuffle_epi8(src_vals, src_indices);
+  if (grade_x == 1) {
+    if (num_taps > 2) {
+      source[1] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 2), src_indices);
+    }
+    if (num_taps > 4) {
+      source[2] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 4), src_indices);
+    }
+    if (num_taps > 6) {
+      source[3] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 6), src_indices);
+    }
+  } else {
+    assert(grade_x > 1);
+    assert(num_taps != 4);
+    // grade_x > 1 also means width >= 8 && num_taps != 4.
+    const __m128i src_vals_ext = LoadLo8(src + 16);
+    if (num_taps > 2) {
+      source[1] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 2),
+                                   src_indices);
+      source[2] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 4),
+                                   src_indices);
+    }
+    if (num_taps > 6) {
+      source[3] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 6),
+                                   src_indices);
+    }
+  }
+}
+
+template <int num_taps>
+inline void PrepareHorizontalTaps(const __m128i subpel_indices,
+                                  const __m128i* filter_taps,
+                                  __m128i* out_taps) {
+  const __m128i scale_index_offsets =
+      _mm_srli_epi16(subpel_indices, kFilterIndexShift);
+  const __m128i filter_index_mask = _mm_set1_epi8(kSubPixelMask);
+  const __m128i filter_indices =
+      _mm_and_si128(_mm_packus_epi16(scale_index_offsets, scale_index_offsets),
+                    filter_index_mask);
+  // Line up taps for maddubs_epi16.
+  // The unpack is assumed to be cheaper than shift+alignr.
+  for (int k = 0; k < (num_taps >> 1); ++k) {
+    const __m128i taps0 = _mm_shuffle_epi8(filter_taps[2 * k], filter_indices);
+    const __m128i taps1 =
+        _mm_shuffle_epi8(filter_taps[2 * k + 1], filter_indices);
+    out_taps[k] = _mm_unpacklo_epi8(taps0, taps1);
+  }
+}
+
+inline __m128i HorizontalScaleIndices(const __m128i subpel_indices) {
+  const __m128i src_indices16 =
+      _mm_srli_epi16(subpel_indices, kScaleSubPixelBits);
+  const __m128i src_indices = _mm_packus_epi16(src_indices16, src_indices16);
+  return _mm_unpacklo_epi8(src_indices,
+                           _mm_add_epi8(src_indices, _mm_set1_epi8(1)));
+}
+
+template <int grade_x, int filter_index, int num_taps>
+inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride,
+                                    int width, int subpixel_x, int step_x,
+                                    int intermediate_height,
+                                    int16_t* intermediate) {
+  // Account for the 0-taps that precede the nonzero taps.
+  const int kernel_offset = (8 - num_taps) >> 1;
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  __m128i filter_taps[num_taps];
+  GetHalfSubPixelFilter<filter_index>(filter_taps);
+  const __m128i index_steps =
+      _mm_mullo_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
+                      _mm_set1_epi16(static_cast<int16_t>(step_x)));
+
+  __m128i taps[num_taps >> 1];
+  __m128i source[num_taps >> 1];
+  int p = subpixel_x;
+  // Filter indices 4 and 5 are only used when width <= 4; filter index 3 can
+  // serve any width but takes this small-width path only when width <= 4.
+  if (filter_index >= 3) {
+    if (filter_index > 3 || width <= 4) {
+      const uint8_t* src_x =
+          &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+      // Only add steps to the 10-bit truncated p to avoid overflow.
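+      // |p| is a fixed-point position with kScaleSubPixelBits fractional
+      // bits, so (p & 1023) retains only the fractional part; the whole-pixel
+      // part is already folded into |src_x| above.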
+      const __m128i p_fraction = _mm_set1_epi16(p & 1023);
+      const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
+      PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
+      const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
+
+      int y = intermediate_height;
+      do {
+        // Load and line up source values with the taps. Width 4 means no need
+        // to load extended source.
+        PrepareSourceVectors<num_taps, /*grade_x=*/1>(src_x, packed_indices,
+                                                      source);
+
+        // Shift by one less because the taps are halved.
+        StoreLo8(intermediate, RightShiftWithRounding_S16(
+                                   SumOnePassTaps<filter_index>(source, taps),
+                                   kInterRoundBitsHorizontal - 1));
+        src_x += src_stride;
+        intermediate += kIntermediateStride;
+      } while (--y != 0);
+      return;
+    }
+  }
+
+  // |width| >= 8
+  int x = 0;
+  do {
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    int16_t* intermediate_x = intermediate + x;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const __m128i p_fraction = _mm_set1_epi16(p & 1023);
+    const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
+    PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
+    const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
+
+    int y = intermediate_height;
+    do {
+      // For each x, a lane of source[k] contains src_x[k].
+      PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source);
+
+      // Shift by one less because the taps are halved.
+      StoreAligned16(
+          intermediate_x,
+          RightShiftWithRounding_S16(SumOnePassTaps<filter_index>(source, taps),
+                                     kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate_x += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+template <int num_taps>
+inline void PrepareVerticalTaps(const int8_t* taps, __m128i* output) {
+  // Avoid overreading the filter due to starting at kernel_offset.
+  // The only danger of overread is in the final filter, which has 4 taps.
+  const __m128i filter =
+      _mm_cvtepi8_epi16((num_taps > 4) ? LoadLo8(taps) : Load4(taps));
+  output[0] = _mm_shuffle_epi32(filter, 0);
+  if (num_taps > 2) {
+    output[1] = _mm_shuffle_epi32(filter, 0x55);
+  }
+  if (num_taps > 4) {
+    output[2] = _mm_shuffle_epi32(filter, 0xAA);
+  }
+  if (num_taps > 6) {
+    output[3] = _mm_shuffle_epi32(filter, 0xFF);
+  }
+}
+
+// Process eight 16-bit inputs and output eight 16-bit values.
+template <int num_taps, bool is_compound> +inline __m128i Sum2DVerticalTaps(const __m128i* const src, + const __m128i* taps) { + const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]); + __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps[0]); + const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]); + __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps[0]); + if (num_taps > 2) { + const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]); + sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps[1])); + const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]); + sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps[1])); + } + if (num_taps > 4) { + const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]); + sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps[2])); + const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]); + sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps[2])); + } + if (num_taps > 6) { + const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]); + sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps[3])); + const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]); + sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps[3])); + } + if (is_compound) { + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), + RightShiftWithRounding_S32(sum_hi, + kInterRoundBitsCompoundVertical - 1)); + } + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), + RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1)); +} + +// Bottom half of each src[k] is the source for one filter, and the top half +// is the source for the other filter, for the next destination row. +template <int num_taps, bool is_compound> +__m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo, + const __m128i* taps_hi) { + const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]); + __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps_lo[0]); + const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]); + __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps_hi[0]); + if (num_taps > 2) { + const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]); + sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps_lo[1])); + const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]); + sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps_hi[1])); + } + if (num_taps > 4) { + const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]); + sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps_lo[2])); + const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]); + sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps_hi[2])); + } + if (num_taps > 6) { + const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]); + sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps_lo[3])); + const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]); + sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps_hi[3])); + } + + if (is_compound) { + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), + RightShiftWithRounding_S32(sum_hi, + kInterRoundBitsCompoundVertical - 1)); + } + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), + RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1)); +} + +// |width_class| is 2, 4, or 8, according to the Store function that should be +// used. 
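+// In the small-width path below, each loop iteration therefore produces two
+// destination rows at once: LoadLo8/LoadHi8 pack the sources for consecutive
+// rows into one register, and each half is filtered with its own tap set
+// because |p| advances by |step_y| between rows.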
+template <int num_taps, int width_class, bool is_compound> +#if LIBGAV1_MSAN +__attribute__((no_sanitize_memory)) void ConvolveVerticalScale( +#else +inline void ConvolveVerticalScale( +#endif + const int16_t* src, const int width, const int subpixel_y, + const int filter_index, const int step_y, const int height, void* dest, + const ptrdiff_t dest_stride) { + constexpr ptrdiff_t src_stride = kIntermediateStride; + constexpr int kernel_offset = (8 - num_taps) / 2; + const int16_t* src_y = src; + // |dest| is 16-bit in compound mode, Pixel otherwise. + auto* dest16_y = static_cast<uint16_t*>(dest); + auto* dest_y = static_cast<uint8_t*>(dest); + __m128i s[num_taps]; + + int p = subpixel_y & 1023; + int y = height; + if (width_class <= 4) { + __m128i filter_taps_lo[num_taps >> 1]; + __m128i filter_taps_hi[num_taps >> 1]; + do { // y > 0 + for (int i = 0; i < num_taps; ++i) { + s[i] = LoadLo8(src_y + i * src_stride); + } + int filter_id = (p >> 6) & kSubPixelMask; + const int8_t* filter0 = + kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset; + PrepareVerticalTaps<num_taps>(filter0, filter_taps_lo); + p += step_y; + src_y = src + (p >> kScaleSubPixelBits) * src_stride; + + for (int i = 0; i < num_taps; ++i) { + s[i] = LoadHi8(s[i], src_y + i * src_stride); + } + filter_id = (p >> 6) & kSubPixelMask; + const int8_t* filter1 = + kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset; + PrepareVerticalTaps<num_taps>(filter1, filter_taps_hi); + p += step_y; + src_y = src + (p >> kScaleSubPixelBits) * src_stride; + + const __m128i sums = Sum2DVerticalTaps4x2<num_taps, is_compound>( + s, filter_taps_lo, filter_taps_hi); + if (is_compound) { + assert(width_class > 2); + StoreLo8(dest16_y, sums); + dest16_y += dest_stride; + StoreHi8(dest16_y, sums); + dest16_y += dest_stride; + } else { + const __m128i result = _mm_packus_epi16(sums, sums); + if (width_class == 2) { + Store2(dest_y, result); + dest_y += dest_stride; + Store2(dest_y, _mm_srli_si128(result, 4)); + } else { + Store4(dest_y, result); + dest_y += dest_stride; + Store4(dest_y, _mm_srli_si128(result, 4)); + } + dest_y += dest_stride; + } + y -= 2; + } while (y != 0); + return; + } + + // |width_class| >= 8 + __m128i filter_taps[num_taps >> 1]; + do { // y > 0 + src_y = src + (p >> kScaleSubPixelBits) * src_stride; + const int filter_id = (p >> 6) & kSubPixelMask; + const int8_t* filter = + kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset; + PrepareVerticalTaps<num_taps>(filter, filter_taps); + + int x = 0; + do { // x < width + for (int i = 0; i < num_taps; ++i) { + s[i] = LoadUnaligned16(src_y + i * src_stride); + } + + const __m128i sums = + Sum2DVerticalTaps<num_taps, is_compound>(s, filter_taps); + if (is_compound) { + StoreUnaligned16(dest16_y + x, sums); + } else { + StoreLo8(dest_y + x, _mm_packus_epi16(sums, sums)); + } + x += 8; + src_y += 8; + } while (x < width); + p += step_y; + dest_y += dest_stride; + dest16_y += dest_stride; + } while (--y != 0); +} + +template <bool is_compound> +void ConvolveScale2D_SSE4_1(const void* const reference, + const ptrdiff_t reference_stride, + const int horizontal_filter_index, + const int vertical_filter_index, + const int subpixel_x, const int subpixel_y, + const int step_x, const int step_y, const int width, + const int height, void* prediction, + const ptrdiff_t pred_stride) { + const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); + const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); + assert(step_x <= 2048); 
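+  // |step_x| is in 1/1024-pel units; AV1 caps scaled references at twice the
+  // size of the current frame, so a single step never spans more than two
+  // whole source pixels, which is what the assert above enforces.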
+ // The output of the horizontal filter, i.e. the intermediate_result, is + // guaranteed to fit in int16_t. + // TODO(petersonab): Reduce intermediate block stride to width to make smaller + // blocks faster. + alignas(16) int16_t + intermediate_result[kMaxSuperBlockSizeInPixels * + (2 * kMaxSuperBlockSizeInPixels + kSubPixelTaps)]; + const int num_vert_taps = GetNumTapsInFilter(vert_filter_index); + const int intermediate_height = + (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >> + kScaleSubPixelBits) + + num_vert_taps; + + // Horizontal filter. + // Filter types used for width <= 4 are different from those for width > 4. + // When width > 4, the valid filter index range is always [0, 3]. + // When width <= 4, the valid filter index range is always [3, 5]. + // Similarly for height. + int16_t* intermediate = intermediate_result; + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference); + const int vert_kernel_offset = (8 - num_vert_taps) / 2; + src += vert_kernel_offset * src_stride; + + // Derive the maximum value of |step_x| at which all source values fit in one + // 16-byte load. Final index is src_x + |num_taps| - 1 < 16 + // step_x*7 is the final base sub-pixel index for the shuffle mask for filter + // inputs in each iteration on large blocks. When step_x is large, we need a + // second register and alignr in order to gather all filter inputs. + // |num_taps| - 1 is the offset for the shuffle of inputs to the final tap. + const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index); + const int kernel_start_ceiling = 16 - num_horiz_taps; + // This truncated quotient |grade_x_threshold| selects |step_x| such that: + // (step_x * 7) >> kScaleSubPixelBits < single load limit + const int grade_x_threshold = + (kernel_start_ceiling << kScaleSubPixelBits) / 7; + switch (horiz_filter_index) { + case 0: + if (step_x > grade_x_threshold) { + ConvolveHorizontalScale<2, 0, 6>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } else { + ConvolveHorizontalScale<1, 0, 6>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + break; + case 1: + if (step_x > grade_x_threshold) { + ConvolveHorizontalScale<2, 1, 6>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + + } else { + ConvolveHorizontalScale<1, 1, 6>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + break; + case 2: + if (step_x > grade_x_threshold) { + ConvolveHorizontalScale<2, 2, 8>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } else { + ConvolveHorizontalScale<1, 2, 8>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + break; + case 3: + if (step_x > grade_x_threshold) { + ConvolveHorizontalScale<2, 3, 2>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } else { + ConvolveHorizontalScale<1, 3, 2>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + break; + case 4: + assert(width <= 4); + ConvolveHorizontalScale<1, 4, 4>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + break; + default: + assert(horiz_filter_index == 5); + assert(width <= 4); + ConvolveHorizontalScale<1, 5, 4>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + + // Vertical filter. 
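+  // The tap counts mirror the horizontal pass: filter indices 0 and 1 use 6
+  // taps, index 2 uses 8, index 3 uses 2, and indices 4 and 5 use 4.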
+ intermediate = intermediate_result; + switch (vert_filter_index) { + case 0: + case 1: + if (!is_compound && width == 2) { + ConvolveVerticalScale<6, 2, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale<6, 4, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else { + ConvolveVerticalScale<6, 8, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } + break; + case 2: + if (!is_compound && width == 2) { + ConvolveVerticalScale<8, 2, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale<8, 4, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else { + ConvolveVerticalScale<8, 8, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } + break; + case 3: + if (!is_compound && width == 2) { + ConvolveVerticalScale<2, 2, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale<2, 4, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else { + ConvolveVerticalScale<2, 8, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } + break; + default: + assert(vert_filter_index == 4 || vert_filter_index == 5); + if (!is_compound && width == 2) { + ConvolveVerticalScale<4, 2, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale<4, 4, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } else { + ConvolveVerticalScale<4, 8, is_compound>( + intermediate, width, subpixel_y, vert_filter_index, step_y, height, + prediction, pred_stride); + } + } +} + +inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) { + const __m128i left = LoadUnaligned16(src); + const __m128i right = LoadUnaligned16(src + 1); + StoreUnaligned16(dst, _mm_avg_epu8(left, right)); +} + +template <int width> +inline void IntraBlockCopyHorizontal(const uint8_t* src, + const ptrdiff_t src_stride, + const int height, uint8_t* dst, + const ptrdiff_t dst_stride) { + const ptrdiff_t src_remainder_stride = src_stride - (width - 16); + const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16); + + int y = height; + do { + HalfAddHorizontal(src, dst); + if (width >= 32) { + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + if (width >= 64) { + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + if (width == 128) { + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + } + } + } + src += src_remainder_stride; + dst += dst_remainder_stride; + } while (--y != 0); +} + +void ConvolveIntraBlockCopyHorizontal_SSE4_1( + const void* const reference, const ptrdiff_t reference_stride, + const int /*horizontal_filter_index*/, const int 
/*vertical_filter_index*/, + const int /*subpixel_x*/, const int /*subpixel_y*/, const int width, + const int height, void* const prediction, const ptrdiff_t pred_stride) { + const auto* src = static_cast<const uint8_t*>(reference); + auto* dest = static_cast<uint8_t*>(prediction); + + if (width == 128) { + IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 64) { + IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 32) { + IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 16) { + IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 8) { + int y = height; + do { + const __m128i left = LoadLo8(src); + const __m128i right = LoadLo8(src + 1); + StoreLo8(dest, _mm_avg_epu8(left, right)); + + src += reference_stride; + dest += pred_stride; + } while (--y != 0); + } else if (width == 4) { + int y = height; + do { + __m128i left = Load4(src); + __m128i right = Load4(src + 1); + src += reference_stride; + left = _mm_unpacklo_epi32(left, Load4(src)); + right = _mm_unpacklo_epi32(right, Load4(src + 1)); + src += reference_stride; + + const __m128i result = _mm_avg_epu8(left, right); + + Store4(dest, result); + dest += pred_stride; + Store4(dest, _mm_srli_si128(result, 4)); + dest += pred_stride; + y -= 2; + } while (y != 0); + } else { + assert(width == 2); + __m128i left = _mm_setzero_si128(); + __m128i right = _mm_setzero_si128(); + int y = height; + do { + left = Load2<0>(src, left); + right = Load2<0>(src + 1, right); + src += reference_stride; + left = Load2<1>(src, left); + right = Load2<1>(src + 1, right); + src += reference_stride; + + const __m128i result = _mm_avg_epu8(left, right); + + Store2(dest, result); + dest += pred_stride; + Store2(dest, _mm_srli_si128(result, 2)); + dest += pred_stride; + y -= 2; + } while (y != 0); + } +} + +template <int width> +inline void IntraBlockCopyVertical(const uint8_t* src, + const ptrdiff_t src_stride, const int height, + uint8_t* dst, const ptrdiff_t dst_stride) { + const ptrdiff_t src_remainder_stride = src_stride - (width - 16); + const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16); + __m128i row[8], below[8]; + + row[0] = LoadUnaligned16(src); + if (width >= 32) { + src += 16; + row[1] = LoadUnaligned16(src); + if (width >= 64) { + src += 16; + row[2] = LoadUnaligned16(src); + src += 16; + row[3] = LoadUnaligned16(src); + if (width == 128) { + src += 16; + row[4] = LoadUnaligned16(src); + src += 16; + row[5] = LoadUnaligned16(src); + src += 16; + row[6] = LoadUnaligned16(src); + src += 16; + row[7] = LoadUnaligned16(src); + } + } + } + src += src_remainder_stride; + + int y = height; + do { + below[0] = LoadUnaligned16(src); + if (width >= 32) { + src += 16; + below[1] = LoadUnaligned16(src); + if (width >= 64) { + src += 16; + below[2] = LoadUnaligned16(src); + src += 16; + below[3] = LoadUnaligned16(src); + if (width == 128) { + src += 16; + below[4] = LoadUnaligned16(src); + src += 16; + below[5] = LoadUnaligned16(src); + src += 16; + below[6] = LoadUnaligned16(src); + src += 16; + below[7] = LoadUnaligned16(src); + } + } + } + src += src_remainder_stride; + + StoreUnaligned16(dst, _mm_avg_epu8(row[0], below[0])); + row[0] = below[0]; + if (width >= 32) { + dst += 16; + StoreUnaligned16(dst, _mm_avg_epu8(row[1], below[1])); + row[1] = below[1]; + if (width >= 64) { + dst += 16; + StoreUnaligned16(dst, _mm_avg_epu8(row[2], 
below[2])); + row[2] = below[2]; + dst += 16; + StoreUnaligned16(dst, _mm_avg_epu8(row[3], below[3])); + row[3] = below[3]; + if (width >= 128) { + dst += 16; + StoreUnaligned16(dst, _mm_avg_epu8(row[4], below[4])); + row[4] = below[4]; + dst += 16; + StoreUnaligned16(dst, _mm_avg_epu8(row[5], below[5])); + row[5] = below[5]; + dst += 16; + StoreUnaligned16(dst, _mm_avg_epu8(row[6], below[6])); + row[6] = below[6]; + dst += 16; + StoreUnaligned16(dst, _mm_avg_epu8(row[7], below[7])); + row[7] = below[7]; + } + } + } + dst += dst_remainder_stride; + } while (--y != 0); +} + +void ConvolveIntraBlockCopyVertical_SSE4_1( + const void* const reference, const ptrdiff_t reference_stride, + const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, + const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/, + const int width, const int height, void* const prediction, + const ptrdiff_t pred_stride) { + const auto* src = static_cast<const uint8_t*>(reference); + auto* dest = static_cast<uint8_t*>(prediction); + + if (width == 128) { + IntraBlockCopyVertical<128>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 64) { + IntraBlockCopyVertical<64>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 32) { + IntraBlockCopyVertical<32>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 16) { + IntraBlockCopyVertical<16>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 8) { + __m128i row, below; + row = LoadLo8(src); + src += reference_stride; + + int y = height; + do { + below = LoadLo8(src); + src += reference_stride; + + StoreLo8(dest, _mm_avg_epu8(row, below)); + dest += pred_stride; + + row = below; + } while (--y != 0); + } else if (width == 4) { + __m128i row = Load4(src); + src += reference_stride; + + int y = height; + do { + __m128i below = Load4(src); + src += reference_stride; + + Store4(dest, _mm_avg_epu8(row, below)); + dest += pred_stride; + + row = below; + } while (--y != 0); + } else { + assert(width == 2); + __m128i row = Load2(src); + __m128i below = _mm_setzero_si128(); + src += reference_stride; + + int y = height; + do { + below = Load2<0>(src, below); + src += reference_stride; + + Store2(dest, _mm_avg_epu8(row, below)); + dest += pred_stride; + + row = below; + } while (--y != 0); + } +} + +// Load then add two uint8_t vectors. Return the uint16_t vector result. +inline __m128i LoadU8AndAddLong(const uint8_t* src, const uint8_t* src1) { + const __m128i a = _mm_cvtepu8_epi16(LoadLo8(src)); + const __m128i b = _mm_cvtepu8_epi16(LoadLo8(src1)); + return _mm_add_epi16(a, b); +} + +inline __m128i AddU16RightShift2AndPack(__m128i v0, __m128i v1) { + const __m128i a = _mm_add_epi16(v0, v1); + const __m128i b = _mm_srli_epi16(a, 1); + // Use avg here to shift right by 1 with round. 
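+  // Since |b| == a >> 1 truncates and avg(b, 0) == (b + 1) >> 1, the result
+  // is (a + 2) >> 2: the four-pixel sum rounded to the nearest integer.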
+ const __m128i c = _mm_avg_epu16(b, _mm_setzero_si128()); + return _mm_packus_epi16(c, c); +} + +template <int width> +inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride, + const int height, uint8_t* dst, + const ptrdiff_t dst_stride) { + const ptrdiff_t src_remainder_stride = src_stride - (width - 8); + const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8); + __m128i row[16]; + row[0] = LoadU8AndAddLong(src, src + 1); + if (width >= 16) { + src += 8; + row[1] = LoadU8AndAddLong(src, src + 1); + if (width >= 32) { + src += 8; + row[2] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[3] = LoadU8AndAddLong(src, src + 1); + if (width >= 64) { + src += 8; + row[4] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[5] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[6] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[7] = LoadU8AndAddLong(src, src + 1); + if (width == 128) { + src += 8; + row[8] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[9] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[10] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[11] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[12] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[13] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[14] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[15] = LoadU8AndAddLong(src, src + 1); + } + } + } + } + src += src_remainder_stride; + + int y = height; + do { + const __m128i below_0 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[0], below_0)); + row[0] = below_0; + if (width >= 16) { + src += 8; + dst += 8; + + const __m128i below_1 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[1], below_1)); + row[1] = below_1; + if (width >= 32) { + src += 8; + dst += 8; + + const __m128i below_2 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[2], below_2)); + row[2] = below_2; + src += 8; + dst += 8; + + const __m128i below_3 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[3], below_3)); + row[3] = below_3; + if (width >= 64) { + src += 8; + dst += 8; + + const __m128i below_4 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[4], below_4)); + row[4] = below_4; + src += 8; + dst += 8; + + const __m128i below_5 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[5], below_5)); + row[5] = below_5; + src += 8; + dst += 8; + + const __m128i below_6 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[6], below_6)); + row[6] = below_6; + src += 8; + dst += 8; + + const __m128i below_7 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[7], below_7)); + row[7] = below_7; + if (width == 128) { + src += 8; + dst += 8; + + const __m128i below_8 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[8], below_8)); + row[8] = below_8; + src += 8; + dst += 8; + + const __m128i below_9 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[9], below_9)); + row[9] = below_9; + src += 8; + dst += 8; + + const __m128i below_10 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[10], below_10)); + row[10] = below_10; + src += 8; + dst += 8; + + const __m128i below_11 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[11], below_11)); + row[11] = below_11; + src += 8; + dst += 8; + + const __m128i below_12 = 
LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[12], below_12)); + row[12] = below_12; + src += 8; + dst += 8; + + const __m128i below_13 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[13], below_13)); + row[13] = below_13; + src += 8; + dst += 8; + + const __m128i below_14 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[14], below_14)); + row[14] = below_14; + src += 8; + dst += 8; + + const __m128i below_15 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[15], below_15)); + row[15] = below_15; + } + } + } + } + src += src_remainder_stride; + dst += dst_remainder_stride; + } while (--y != 0); +} + +void ConvolveIntraBlockCopy2D_SSE4_1( + const void* const reference, const ptrdiff_t reference_stride, + const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, + const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/, + const int width, const int height, void* const prediction, + const ptrdiff_t pred_stride) { + const auto* src = static_cast<const uint8_t*>(reference); + auto* dest = static_cast<uint8_t*>(prediction); + // Note: allow vertical access to height + 1. Because this function is only + // for u/v plane of intra block copy, such access is guaranteed to be within + // the prediction block. + + if (width == 128) { + IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride); + } else if (width == 64) { + IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride); + } else if (width == 32) { + IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride); + } else if (width == 16) { + IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride); + } else if (width == 8) { + IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride); + } else if (width == 4) { + __m128i left = _mm_cvtepu8_epi16(Load4(src)); + __m128i right = _mm_cvtepu8_epi16(Load4(src + 1)); + src += reference_stride; + + __m128i row = _mm_add_epi16(left, right); + + int y = height; + do { + left = Load4(src); + right = Load4(src + 1); + src += reference_stride; + left = _mm_unpacklo_epi32(left, Load4(src)); + right = _mm_unpacklo_epi32(right, Load4(src + 1)); + src += reference_stride; + + const __m128i below = + _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right)); + const __m128i result = + AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below); + + Store4(dest, result); + dest += pred_stride; + Store4(dest, _mm_srli_si128(result, 4)); + dest += pred_stride; + + row = _mm_srli_si128(below, 8); + y -= 2; + } while (y != 0); + } else { + __m128i left = Load2(src); + __m128i right = Load2(src + 1); + src += reference_stride; + + __m128i row = + _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right)); + + int y = height; + do { + left = Load2<0>(src, left); + right = Load2<0>(src + 1, right); + src += reference_stride; + left = Load2<2>(src, left); + right = Load2<2>(src + 1, right); + src += reference_stride; + + const __m128i below = + _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right)); + const __m128i result = + AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below); + + Store2(dest, result); + dest += pred_stride; + Store2(dest, _mm_srli_si128(result, 4)); + dest += pred_stride; + + row = _mm_srli_si128(below, 8); + y -= 2; + } while (y != 0); + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != 
nullptr);
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_SSE4_1;
+  dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1;
+  dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1;
+
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4;
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1;
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1;
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1;
+
+  dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_SSE4_1;
+  dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_SSE4_1;
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_SSE4_1;
+
+  dsp->convolve_scale[0] = ConvolveScale2D_SSE4_1<false>;
+  dsp->convolve_scale[1] = ConvolveScale2D_SSE4_1<true>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void ConvolveInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1