Diffstat (limited to 'src/dsp/x86')
-rw-r--r--  src/dsp/x86/average_blend_sse4.cc           |  84
-rw-r--r--  src/dsp/x86/common_sse4_test.cc             |   4
-rw-r--r--  src/dsp/x86/convolve_avx2.cc                | 322
-rw-r--r--  src/dsp/x86/convolve_sse4.cc                | 187
-rw-r--r--  src/dsp/x86/convolve_sse4.inc               |  98
-rw-r--r--  src/dsp/x86/distance_weighted_blend_sse4.cc | 152
-rw-r--r--  src/dsp/x86/film_grain_sse4.cc              |  14
-rw-r--r--  src/dsp/x86/intrapred_directional_sse4.cc   | 239
-rw-r--r--  src/dsp/x86/loop_restoration_sse4.cc        |   1
-rw-r--r--  src/dsp/x86/mask_blend_sse4.cc              | 336
-rw-r--r--  src/dsp/x86/obmc_sse4.cc                    | 144
-rw-r--r--  src/dsp/x86/warp_sse4.cc                    |  58
-rw-r--r--  src/dsp/x86/weight_mask_sse4.cc             | 360
13 files changed, 976 insertions(+), 1023 deletions(-)
diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc
index 911c5a9..c08b3d6 100644
--- a/src/dsp/x86/average_blend_sse4.cc
+++ b/src/dsp/x86/average_blend_sse4.cc
@@ -35,24 +35,46 @@ namespace {
constexpr int kInterPostRoundBit = 4;
-inline void AverageBlend4Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
- const int16_t* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT dest) {
- const __m128i pred_0 = LoadLo8(prediction_0);
- const __m128i pred_1 = LoadLo8(prediction_1);
- __m128i res = _mm_add_epi16(pred_0, pred_1);
- res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
- Store4(dest, _mm_packus_epi16(res, res));
+inline void AverageBlend4x4Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dest_stride) {
+ const __m128i pred_00 = LoadAligned16(prediction_0);
+ const __m128i pred_10 = LoadAligned16(prediction_1);
+ __m128i res_0 = _mm_add_epi16(pred_00, pred_10);
+ res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1);
+ const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+ const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+ __m128i res_1 = _mm_add_epi16(pred_01, pred_11);
+ res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1);
+ const __m128i result_pixels = _mm_packus_epi16(res_0, res_1);
+ Store4(dest, result_pixels);
+ dest += dest_stride;
+ const int result_1 = _mm_extract_epi32(result_pixels, 1);
+ memcpy(dest, &result_1, sizeof(result_1));
+ dest += dest_stride;
+ const int result_2 = _mm_extract_epi32(result_pixels, 2);
+ memcpy(dest, &result_2, sizeof(result_2));
+ dest += dest_stride;
+ const int result_3 = _mm_extract_epi32(result_pixels, 3);
+ memcpy(dest, &result_3, sizeof(result_3));
}
inline void AverageBlend8Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
const int16_t* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT dest) {
- const __m128i pred_0 = LoadAligned16(prediction_0);
- const __m128i pred_1 = LoadAligned16(prediction_1);
- __m128i res = _mm_add_epi16(pred_0, pred_1);
- res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
- StoreLo8(dest, _mm_packus_epi16(res, res));
+ uint8_t* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dest_stride) {
+ const __m128i pred_00 = LoadAligned16(prediction_0);
+ const __m128i pred_10 = LoadAligned16(prediction_1);
+ __m128i res_0 = _mm_add_epi16(pred_00, pred_10);
+ res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1);
+ const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+ const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+ __m128i res_1 = _mm_add_epi16(pred_01, pred_11);
+ res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1);
+ const __m128i result_pixels = _mm_packus_epi16(res_0, res_1);
+ StoreLo8(dest, result_pixels);
+ StoreHi8(dest + dest_stride, result_pixels);
}
inline void AverageBlendLargeRow(const int16_t* LIBGAV1_RESTRICT prediction_0,
@@ -85,35 +107,27 @@ void AverageBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
int y = height;
if (width == 4) {
+ const ptrdiff_t dest_stride4 = dest_stride << 2;
+ constexpr ptrdiff_t width4 = 4 << 2;
do {
- // TODO(b/150326556): |prediction_[01]| values are packed. It is possible
- // to load 8 values at a time.
- AverageBlend4Row(pred_0, pred_1, dst);
- dst += dest_stride;
- pred_0 += width;
- pred_1 += width;
-
- AverageBlend4Row(pred_0, pred_1, dst);
- dst += dest_stride;
- pred_0 += width;
- pred_1 += width;
+ AverageBlend4x4Row(pred_0, pred_1, dst, dest_stride);
+ dst += dest_stride4;
+ pred_0 += width4;
+ pred_1 += width4;
- y -= 2;
+ y -= 4;
} while (y != 0);
return;
}
if (width == 8) {
+ const ptrdiff_t dest_stride2 = dest_stride << 1;
+ constexpr ptrdiff_t width2 = 8 << 1;
do {
- AverageBlend8Row(pred_0, pred_1, dst);
- dst += dest_stride;
- pred_0 += width;
- pred_1 += width;
-
- AverageBlend8Row(pred_0, pred_1, dst);
- dst += dest_stride;
- pred_0 += width;
- pred_1 += width;
+ AverageBlend8Row(pred_0, pred_1, dst, dest_stride);
+ dst += dest_stride2;
+ pred_0 += width2;
+ pred_1 += width2;
y -= 2;
} while (y != 0);
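For reference, a minimal scalar sketch of the arithmetic both blend helpers vectorize, assuming the patch's kInterPostRoundBit = 4 (hypothetical standalone code, not part of the change):

#include <cstdint>

// Average two 16-bit predictions and round-shift down to one 8-bit pixel.
// The shift is kInterPostRoundBit + 1 = 5; the clamp mirrors _mm_packus_epi16.
inline uint8_t AverageBlendPixel(int16_t pred_0, int16_t pred_1) {
  const int sum = pred_0 + pred_1;
  const int rounded = (sum + 16) >> 5;  // RightShiftWithRounding(sum, 5)
  if (rounded < 0) return 0;
  if (rounded > 255) return 255;
  return static_cast<uint8_t>(rounded);
}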
diff --git a/src/dsp/x86/common_sse4_test.cc b/src/dsp/x86/common_sse4_test.cc
index 4ea811a..3288cfc 100644
--- a/src/dsp/x86/common_sse4_test.cc
+++ b/src/dsp/x86/common_sse4_test.cc
@@ -31,7 +31,7 @@ namespace {
// INT16_MAX - ((1 << bits) >> 1). In particular, if bits == 16, then
// RightShiftWithRounding_S16() is equal to RightShiftWithRounding() only for
// negative values.
-TEST(CommonDspTest, SSE4RightShiftWithRoundingS16) {
+TEST(CommonDspTest, SSE41RightShiftWithRoundingS16) {
for (int bits = 0; bits < 16; ++bits) {
const int bias = (1 << bits) >> 1;
for (int32_t value = INT16_MIN; value <= INT16_MAX; ++value) {
@@ -56,7 +56,7 @@ TEST(CommonDspTest, SSE4RightShiftWithRoundingS16) {
#else // !LIBGAV1_TARGETING_SSE4_1
-TEST(CommonDspTest, SSE4) {
+TEST(CommonDspTest, SSE41) {
GTEST_SKIP() << "Build this module for x86(-64) with SSE4 enabled to enable "
"the tests.";
}
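The renamed tests exercise the rounding identity described in the hunk context above; a scalar sketch, assuming the bias convention from that comment:

#include <cstdint>

// Scalar reference: add half the shift range as bias, then shift.
inline int32_t RightShiftWithRounding(int32_t value, int bits) {
  return (value + ((1 << bits) >> 1)) >> bits;
}

// The packed 16-bit variant forms the biased sum in int16_t, so it matches
// the scalar version only while value + bias fits, i.e. for
// value <= INT16_MAX - ((1 << bits) >> 1); larger inputs saturate first.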
diff --git a/src/dsp/x86/convolve_avx2.cc b/src/dsp/x86/convolve_avx2.cc
index 4126ca9..6e94347 100644
--- a/src/dsp/x86/convolve_avx2.cc
+++ b/src/dsp/x86/convolve_avx2.cc
@@ -39,17 +39,17 @@ namespace {
// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
// sum from outranging int16_t.
-template <int filter_index>
+template <int num_taps>
__m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
__m256i sum;
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
const __m256i v_madd_21 = _mm256_maddubs_epi16(src[0], taps[0]); // k2k1
const __m256i v_madd_43 = _mm256_maddubs_epi16(src[1], taps[1]); // k4k3
const __m256i v_madd_65 = _mm256_maddubs_epi16(src[2], taps[2]); // k6k5
sum = _mm256_add_epi16(v_madd_21, v_madd_43);
sum = _mm256_add_epi16(sum, v_madd_65);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
const __m256i v_madd_10 = _mm256_maddubs_epi16(src[0], taps[0]); // k1k0
const __m256i v_madd_32 = _mm256_maddubs_epi16(src[1], taps[1]); // k3k2
@@ -58,7 +58,7 @@ __m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
const __m256i v_sum_3210 = _mm256_add_epi16(v_madd_10, v_madd_32);
const __m256i v_sum_7654 = _mm256_add_epi16(v_madd_54, v_madd_76);
sum = _mm256_add_epi16(v_sum_7654, v_sum_3210);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
sum = _mm256_maddubs_epi16(src[0], taps[0]); // k4k3
} else {
@@ -70,7 +70,7 @@ __m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
return sum;
}
-template <int filter_index>
+template <int num_taps>
__m256i SumHorizontalTaps(const __m256i* const src,
const __m256i* const v_tap) {
__m256i v_src[4];
@@ -78,32 +78,32 @@ __m256i SumHorizontalTaps(const __m256i* const src,
const __m256i src_long_dup_lo = _mm256_unpacklo_epi8(src_long, src_long);
const __m256i src_long_dup_hi = _mm256_unpackhi_epi8(src_long, src_long);
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21
v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10
v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
v_src[3] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
- } else if (filter_index > 3) {
+ } else {
// 4 taps.
v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
}
- return SumOnePassTaps<filter_index>(v_src, v_tap);
+ return SumOnePassTaps<num_taps>(v_src, v_tap);
}
-template <int filter_index>
+template <int num_taps>
__m256i SimpleHorizontalTaps(const __m256i* const src,
const __m256i* const v_tap) {
- __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+ __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap);
// Normally the Horizontal pass does the downshift in two passes:
// kInterRoundBitsHorizontal - 1 and then (kFilterBits -
@@ -116,17 +116,16 @@ __m256i SimpleHorizontalTaps(const __m256i* const src,
return _mm256_packus_epi16(sum, sum);
}
-template <int filter_index>
+template <int num_taps>
__m256i HorizontalTaps8To16(const __m256i* const src,
const __m256i* const v_tap) {
- const __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+ const __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap);
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
// Filter 2xh sizes.
-template <int num_taps, int filter_index, bool is_2d = false,
- bool is_compound = false>
+template <int num_taps, bool is_2d = false, bool is_compound = false>
void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dest,
@@ -145,14 +144,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
do {
if (is_2d) {
const __m128i sum =
- HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+ HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
Store4(&dest16[0], sum);
dest16 += pred_stride;
Store4(&dest16[0], _mm_srli_si128(sum, 8));
dest16 += pred_stride;
} else {
const __m128i sum =
- SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
Store2(dest8, sum);
dest8 += pred_stride;
Store2(dest8, _mm_srli_si128(sum, 4));
@@ -169,7 +168,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
assert(height % 2 == 1);
__m128i sum;
const __m128i input = LoadLo8(&src[2]);
- if (filter_index == 3) {
+ if (num_taps == 2) {
// 03 04 04 05 05 06 06 07 ....
const __m128i v_src_43 =
_mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
@@ -194,8 +193,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
}
// Filter widths >= 4.
-template <int num_taps, int filter_index, bool is_2d = false,
- bool is_compound = false>
+template <int num_taps, bool is_2d = false, bool is_compound = false>
void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dest,
@@ -214,11 +212,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const __m256i src_long =
SetrM128i(LoadUnaligned16(&src[x]), LoadUnaligned16(&src[x + 8]));
const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ HorizontalTaps8To16<num_taps>(&src_long, v_tap);
const __m256i src_long2 = SetrM128i(LoadUnaligned16(&src[x + 16]),
LoadUnaligned16(&src[x + 24]));
const __m256i result2 =
- HorizontalTaps8To16<filter_index>(&src_long2, v_tap);
+ HorizontalTaps8To16<num_taps>(&src_long2, v_tap);
if (is_2d) {
StoreAligned32(&dest16[x], result);
StoreAligned32(&dest16[x + 16], result2);
@@ -230,11 +228,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
// Load src used to calculate dest8[7:0] and dest8[23:16].
const __m256i src_long = LoadUnaligned32(&src[x]);
const __m256i result =
- SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
// Load src used to calculate dest8[15:8] and dest8[31:24].
const __m256i src_long2 = LoadUnaligned32(&src[x + 8]);
const __m256i result2 =
- SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+ SimpleHorizontalTaps<num_taps>(&src_long2, v_tap);
// Combine results and store.
StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2));
}
@@ -252,13 +250,12 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
// Load into 2 128 bit lanes.
const __m256i src_long =
SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
- const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
const __m256i src_long2 =
SetrM128i(LoadUnaligned16(&src[src_stride]),
LoadUnaligned16(&src[8 + src_stride]));
const __m256i result2 =
- HorizontalTaps8To16<filter_index>(&src_long2, v_tap);
+ HorizontalTaps8To16<num_taps>(&src_long2, v_tap);
if (is_2d) {
StoreAligned32(&dest16[0], result);
StoreAligned32(&dest16[pred_stride], result2);
@@ -270,12 +267,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
// Load into 2 128 bit lanes.
const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]),
LoadUnaligned16(&src[src_stride]));
- const __m256i result =
- SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
const __m256i src_long2 = SetrM128i(
LoadUnaligned16(&src[8]), LoadUnaligned16(&src[8 + src_stride]));
const __m256i result2 =
- SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+ SimpleHorizontalTaps<num_taps>(&src_long2, v_tap);
const __m256i packed_result = _mm256_unpacklo_epi64(result, result2);
StoreUnaligned16(&dest8[0], _mm256_castsi256_si128(packed_result));
StoreUnaligned16(&dest8[pred_stride],
@@ -292,8 +288,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
if (is_2d) {
const __m256i src_long =
SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
- const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
StoreAligned32(&dest16[0], result);
}
@@ -306,8 +301,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const __m128i next_row = LoadUnaligned16(&src[src_stride]);
const __m256i src_long = SetrM128i(this_row, next_row);
if (is_2d || is_compound) {
- const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
if (is_2d) {
StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
StoreAligned16(&dest16[pred_stride],
@@ -322,8 +316,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const __m128i next_row = LoadUnaligned16(&src[src_stride]);
// Load into 2 128 bit lanes.
const __m256i src_long = SetrM128i(this_row, next_row);
- const __m256i result =
- SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
StoreLo8(&dest8[0], _mm256_castsi256_si128(result));
StoreLo8(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
}
@@ -337,8 +330,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
// filter the remaining row.
if (is_2d) {
const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
- const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
}
@@ -351,8 +343,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const __m128i next_row = LoadUnaligned16(&src[src_stride]);
const __m256i src_long = SetrM128i(this_row, next_row);
if (is_2d || is_compound) {
- const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
StoreLo8(&dest16[pred_stride], _mm256_extracti128_si256(result, 1));
} else {
@@ -360,8 +351,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const __m128i next_row = LoadUnaligned16(&src[src_stride]);
// Load into 2 128 bit lanes.
const __m256i src_long = SetrM128i(this_row, next_row);
- const __m256i result =
- SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
Store4(&dest8[0], _mm256_castsi256_si128(result));
Store4(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
}
@@ -375,8 +365,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
// filter the remaining row.
if (is_2d) {
const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
- const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
}
}
@@ -554,18 +543,15 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
const __m128i v_horizontal_filter =
LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
- if (filter_index == 4) { // 4 tap.
- SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
- } else if (filter_index == 5) { // 4 tap.
+ if ((filter_index & 0x4) != 0) { // 4 tap.
+ // ((filter_index == 4) | (filter_index == 5))
SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else { // 2 tap.
SetupTaps<2>(&v_horizontal_filter, v_tap);
- FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
}
}
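The merged 4-tap branch relies on the filter indices lying in [0, 5]: only 4 (0b100) and 5 (0b101) have bit 2 set, so the mask test is equivalent to the pair of equality checks it replaces. An illustrative check (not part of the patch):

#include <cassert>

inline void CheckFourTapMask() {
  for (int filter_index = 0; filter_index <= 5; ++filter_index) {
    const bool is_four_tap = (filter_index == 4) || (filter_index == 5);
    assert(((filter_index & 0x4) != 0) == is_four_tap);
  }
}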
@@ -582,28 +568,25 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
if (filter_index == 2) { // 8 tap.
SetupTaps<8>(&v_horizontal_filter, v_tap);
- FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 1) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 0) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
- } else if (filter_index == 4) { // 4 tap.
- SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
- } else if (filter_index == 5) { // 4 tap.
+ FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else if ((filter_index & 0x4) != 0) { // 4 tap.
+ // ((filter_index == 4) | (filter_index == 5))
SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else { // 2 tap.
SetupTaps<2>(&v_horizontal_filter, v_tap);
- FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
}
}
@@ -617,7 +600,8 @@ void Convolve2D_AVX2(const void* LIBGAV1_RESTRICT const reference,
const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
// The output of the horizontal filter is guaranteed to fit in 16 bits.
alignas(32) uint16_t
@@ -730,61 +714,60 @@ __m256i Compound1DShift(const __m256i sum) {
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
-template <int filter_index, bool unpack_high = false>
+template <int num_taps, bool unpack_high = false>
__m256i SumVerticalTaps(const __m256i* const srcs, const __m256i* const v_tap) {
__m256i v_src[4];
if (!unpack_high) {
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
v_src[3] = _mm256_unpacklo_epi8(srcs[6], srcs[7]);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
- } else if (filter_index > 3) {
+ } else {
// 4 taps.
v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
}
} else {
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
v_src[3] = _mm256_unpackhi_epi8(srcs[6], srcs[7]);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
- } else if (filter_index > 3) {
+ } else {
// 4 taps.
v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
}
}
- return SumOnePassTaps<filter_index>(v_src, v_tap);
+ return SumOnePassTaps<num_taps>(v_src, v_tap);
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int width,
const int height, const __m256i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -821,9 +804,9 @@ void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
srcs[next_row] = LoadUnaligned32(src_x);
src_x += src_stride;
- const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m256i sums_hi =
- SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap);
+ SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap);
if (is_compound) {
const __m256i results =
Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
@@ -861,13 +844,12 @@ void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
} while (x < width);
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int /*width*/,
const int height, const __m256i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps;
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -922,9 +904,9 @@ void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
srcs[next_row - 1] = _mm256_inserti128_si256(
srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
- const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m256i sums_hi =
- SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap);
+ SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap);
if (is_compound) {
const __m256i results =
Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
@@ -964,13 +946,12 @@ void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
} while (y != 0);
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int /*width*/,
const int height, const __m256i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps;
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -1025,7 +1006,7 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
srcs[next_row - 1] = _mm256_inserti128_si256(
srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
- const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m256i results = Compound1DShift(sums);
const __m128i this_dst = _mm256_castsi256_si128(results);
@@ -1062,13 +1043,12 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
} while (y != 0);
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int /*width*/,
const int height, const __m128i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -1101,7 +1081,7 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
srcs[next_row] = LoadLo8(src_x);
src_x += src_stride;
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -1137,7 +1117,8 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
const int height, void* LIBGAV1_RESTRICT prediction,
const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(filter_index, vertical_filter_id);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
@@ -1151,43 +1132,43 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
// Use 256 bits for width > 4.
if (width > 4) {
__m256i taps_256[4];
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps_256);
if (width == 8) {
- FilterVertical8xH<0>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical8xH<6>(src, src_stride, dest, dest_stride, width, height,
taps_256);
} else if (width == 16) {
- FilterVertical16xH<0>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical16xH<6>(src, src_stride, dest, dest_stride, width, height,
taps_256);
} else {
- FilterVertical32xH<0>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical32xH<6>(src, src_stride, dest, dest_stride, width, height,
taps_256);
}
- } else if (filter_index == 2) { // 8 tap.
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps_256);
if (width == 8) {
- FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical8xH<8>(src, src_stride, dest, dest_stride, width, height,
taps_256);
} else if (width == 16) {
- FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical16xH<8>(src, src_stride, dest, dest_stride, width, height,
taps_256);
} else {
- FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical32xH<8>(src, src_stride, dest, dest_stride, width, height,
taps_256);
}
- } else if (filter_index == 3) { // 2 tap.
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps_256);
if (width == 8) {
- FilterVertical8xH<3>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height,
taps_256);
} else if (width == 16) {
- FilterVertical16xH<3>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height,
taps_256);
} else {
- FilterVertical32xH<3>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height,
taps_256);
}
- } else if (filter_index == 4) { // 4 tap.
+ } else { // 4 tap.
SetupTaps<4>(&v_filter, taps_256);
if (width == 8) {
FilterVertical8xH<4>(src, src_stride, dest, dest_stride, width, height,
@@ -1199,67 +1180,38 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
FilterVertical32xH<4>(src, src_stride, dest, dest_stride, width, height,
taps_256);
}
- } else {
- SetupTaps<4>(&v_filter, taps_256);
- if (width == 8) {
- FilterVertical8xH<5>(src, src_stride, dest, dest_stride, width, height,
- taps_256);
- } else if (width == 16) {
- FilterVertical16xH<5>(src, src_stride, dest, dest_stride, width, height,
- taps_256);
- } else {
- FilterVertical32xH<5>(src, src_stride, dest, dest_stride, width, height,
- taps_256);
- }
}
} else { // width <= 8
// Use 128 bit code.
__m128i taps[4];
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps);
}
- } else if (filter_index == 2) { // 8 tap.
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
}
- } else if (filter_index == 3) { // 2 tap.
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height,
- taps);
- } else {
- FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height,
- taps);
- }
- } else if (filter_index == 4) { // 4 tap.
- SetupTaps<4>(&v_filter, taps);
- if (width == 2) {
- FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
}
- } else {
+ } else { // 4 tap.
SetupTaps<4>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
}
}
}
@@ -1272,7 +1224,8 @@ void ConvolveCompoundVertical_AVX2(
const int vertical_filter_id, const int width, const int height,
void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(filter_index, vertical_filter_id);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
@@ -1286,43 +1239,43 @@ void ConvolveCompoundVertical_AVX2(
// Use 256 bits for width > 4.
if (width > 4) {
__m256i taps_256[4];
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps_256);
if (width == 8) {
- FilterVertical8xH<0, /*is_compound=*/true>(
+ FilterVertical8xH<6, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
} else if (width == 16) {
- FilterVertical16xH<0, /*is_compound=*/true>(
+ FilterVertical16xH<6, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
} else {
- FilterVertical32xH<0, /*is_compound=*/true>(
+ FilterVertical32xH<6, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
}
- } else if (filter_index == 2) { // 8 tap.
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps_256);
if (width == 8) {
- FilterVertical8xH<2, /*is_compound=*/true>(
+ FilterVertical8xH<8, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
} else if (width == 16) {
- FilterVertical16xH<2, /*is_compound=*/true>(
+ FilterVertical16xH<8, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
} else {
- FilterVertical32xH<2, /*is_compound=*/true>(
+ FilterVertical32xH<8, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
}
- } else if (filter_index == 3) { // 2 tap.
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps_256);
if (width == 8) {
- FilterVertical8xH<3, /*is_compound=*/true>(
+ FilterVertical8xH<2, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
} else if (width == 16) {
- FilterVertical16xH<3, /*is_compound=*/true>(
+ FilterVertical16xH<2, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
} else {
- FilterVertical32xH<3, /*is_compound=*/true>(
+ FilterVertical32xH<2, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
}
- } else if (filter_index == 4) { // 4 tap.
+ } else { // 4 tap.
SetupTaps<4>(&v_filter, taps_256);
if (width == 8) {
FilterVertical8xH<4, /*is_compound=*/true>(
@@ -1334,43 +1287,27 @@ void ConvolveCompoundVertical_AVX2(
FilterVertical32xH<4, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
}
- } else {
- SetupTaps<4>(&v_filter, taps_256);
- if (width == 8) {
- FilterVertical8xH<5, /*is_compound=*/true>(
- src, src_stride, dest, dest_stride, width, height, taps_256);
- } else if (width == 16) {
- FilterVertical16xH<5, /*is_compound=*/true>(
- src, src_stride, dest, dest_stride, width, height, taps_256);
- } else {
- FilterVertical32xH<5, /*is_compound=*/true>(
- src, src_stride, dest, dest_stride, width, height, taps_256);
- }
}
} else { // width <= 4
// Use 128 bit code.
__m128i taps[4];
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps);
- FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest,
- dest_stride, height, taps);
- } else if (filter_index == 2) { // 8 tap.
+ FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps);
- FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest,
- dest_stride, height, taps);
- } else if (filter_index == 3) { // 2 tap.
+ FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps);
- FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest,
- dest_stride, height, taps);
- } else if (filter_index == 4) { // 4 tap.
- SetupTaps<4>(&v_filter, taps);
- FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest,
- dest_stride, height, taps);
- } else {
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else { // 4 tap.
SetupTaps<4>(&v_filter, taps);
- FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest,
- dest_stride, height, taps);
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
}
}
}
@@ -1430,7 +1367,8 @@ void ConvolveCompound2D_AVX2(
void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
// The output of the horizontal filter is guaranteed to fit in 16 bits.
alignas(32) uint16_t
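On the "pre-shifted by 1" note in SumOnePassTaps: AV1 interpolation taps sum to 1 << kFilterBits = 128, so halving them keeps the worst-case accumulation inside int16_t, and the lost bit is recovered by shifting one bit less later (kInterRoundBitsHorizontal - 1). A rough compile-time check under those simplifying assumptions (it ignores the sign structure of individual taps):

#include <cstdint>

constexpr int kFilterBits = 7;                          // taps sum to 128.
constexpr int kMaxPixel = 255;                          // 8-bit source.
constexpr int kHalvedTapSum = (1 << kFilterBits) >> 1;  // 64 after pre-shift.
static_assert(kMaxPixel * kHalvedTapSum <= INT16_MAX,
              "halved taps keep the sum within int16_t");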
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc
index f7e5a71..f427c4c 100644
--- a/src/dsp/x86/convolve_sse4.cc
+++ b/src/dsp/x86/convolve_sse4.cc
@@ -36,7 +36,7 @@ namespace {
#include "src/dsp/x86/convolve_sse4.inc"
-template <int filter_index>
+template <int num_taps>
__m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i* const v_tap) {
__m128i v_src[4];
@@ -44,33 +44,33 @@ __m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i src_long_dup_lo = _mm_unpacklo_epi8(src_long, src_long);
const __m128i src_long_dup_hi = _mm_unpackhi_epi8(src_long, src_long);
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21
v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10
v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
v_src[3] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
- } else if (filter_index > 3) {
+ } else {
// 4 taps.
v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
}
- const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+ const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
return sum;
}
-template <int filter_index>
+template <int num_taps>
__m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i* const v_tap) {
- __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+ __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap);
// Normally the Horizontal pass does the downshift in two passes:
// kInterRoundBitsHorizontal - 1 and then (kFilterBits -
@@ -83,16 +83,15 @@ __m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
return _mm_packus_epi16(sum, sum);
}
-template <int filter_index>
+template <int num_taps>
__m128i HorizontalTaps8To16(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i* const v_tap) {
- const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+ const __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap);
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
-template <int num_taps, int filter_index, bool is_2d = false,
- bool is_compound = false>
+template <int num_taps, bool is_2d = false, bool is_compound = false>
void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dest,
@@ -108,16 +107,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
int x = 0;
do {
if (is_2d || is_compound) {
- const __m128i v_sum =
- HorizontalTaps8To16<filter_index>(&src[x], v_tap);
+ const __m128i v_sum = HorizontalTaps8To16<num_taps>(&src[x], v_tap);
if (is_2d) {
StoreAligned16(&dest16[x], v_sum);
} else {
StoreUnaligned16(&dest16[x], v_sum);
}
} else {
- const __m128i result =
- SimpleHorizontalTaps<filter_index>(&src[x], v_tap);
+ const __m128i result = SimpleHorizontalTaps<num_taps>(&src[x], v_tap);
StoreLo8(&dest8[x], result);
}
x += 8;
@@ -138,10 +135,10 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
int y = height;
do {
if (is_2d || is_compound) {
- const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap);
+ const __m128i v_sum = HorizontalTaps8To16<num_taps>(src, v_tap);
StoreLo8(dest16, v_sum);
} else {
- const __m128i result = SimpleHorizontalTaps<filter_index>(src, v_tap);
+ const __m128i result = SimpleHorizontalTaps<num_taps>(src, v_tap);
Store4(&dest8[0], result);
}
src += src_stride;
@@ -157,14 +154,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
do {
if (is_2d) {
const __m128i sum =
- HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+ HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
Store4(&dest16[0], sum);
dest16 += pred_stride;
Store4(&dest16[0], _mm_srli_si128(sum, 8));
dest16 += pred_stride;
} else {
const __m128i sum =
- SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
Store2(dest8, sum);
dest8 += pred_stride;
Store2(dest8, _mm_srli_si128(sum, 4));
@@ -181,7 +178,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
assert(height % 2 == 1);
__m128i sum;
const __m128i input = LoadLo8(&src[2]);
- if (filter_index == 3) {
+ if (num_taps == 2) {
// 03 04 04 05 05 06 06 07 ....
const __m128i v_src_43 =
_mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
@@ -218,28 +215,25 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
if (filter_index == 2) { // 8 tap.
SetupTaps<8>(&v_horizontal_filter, v_tap);
- FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 1) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 0) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
- } else if (filter_index == 4) { // 4 tap.
- SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
- } else if (filter_index == 5) { // 4 tap.
+ FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else if ((filter_index & 0x4) != 0) { // 4 tap.
+ // ((filter_index == 4) | (filter_index == 5))
SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else { // 2 tap.
SetupTaps<2>(&v_horizontal_filter, v_tap);
- FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
}
}
@@ -253,7 +247,8 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
// The output of the horizontal filter is guaranteed to fit in 16 bits.
alignas(16) uint16_t
@@ -329,13 +324,12 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
}
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int width,
const int height, const __m128i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -373,7 +367,7 @@ void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
srcs[next_row] = LoadLo8(src_x);
src_x += src_stride;
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16_x, results);
@@ -410,7 +404,8 @@ void ConvolveVertical_SSE4_1(
const int vertical_filter_id, const int width, const int height,
void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(filter_index, vertical_filter_id);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
@@ -422,63 +417,50 @@ void ConvolveVertical_SSE4_1(
const __m128i v_filter =
LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
taps);
}
- } else if (filter_index == 2) { // 8 tap.
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<8>(src, src_stride, dest, dest_stride, width, height,
taps);
}
- } else if (filter_index == 3) { // 2 tap.
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
taps);
}
- } else if (filter_index == 4) { // 4 tap.
+ } else { // 4 tap
SetupTaps<4>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
} else {
FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
taps);
}
- } else {
- // TODO(slavarnway): Investigate adding |filter_index| == 1 special cases.
- // See convolve_neon.cc
- SetupTaps<4>(&v_filter, taps);
-
- if (width == 2) {
- FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
- } else if (width == 4) {
- FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
- } else {
- FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
- taps);
- }
}
}
-void ConvolveCompoundCopy_SSE4(
+void ConvolveCompoundCopy_SSE4_1(
const void* LIBGAV1_RESTRICT const reference,
const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
@@ -502,7 +484,6 @@ void ConvolveCompoundCopy_SSE4(
_mm_slli_epi16(v_src_ext_lo, kRoundBitsVertical);
const __m128i v_dest_hi =
_mm_slli_epi16(v_src_ext_hi, kRoundBitsVertical);
- // TODO(slavarnway): Investigate using aligned stores.
StoreUnaligned16(&dest[x], v_dest_lo);
StoreUnaligned16(&dest[x + 8], v_dest_hi);
x += 16;
@@ -544,7 +525,8 @@ void ConvolveCompoundVertical_SSE4_1(
const int vertical_filter_id, const int width, const int height,
void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(filter_index, vertical_filter_id);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
@@ -555,55 +537,42 @@ void ConvolveCompoundVertical_SSE4_1(
const __m128i v_filter =
LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps);
if (width == 4) {
- FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
- FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
}
- } else if (filter_index == 2) { // 8 tap.
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps);
-
if (width == 4) {
- FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
- FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
}
- } else if (filter_index == 3) { // 2 tap.
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps);
-
if (width == 4) {
- FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
- FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
}
- } else if (filter_index == 4) { // 4 tap.
+ } else { // 4 tap
SetupTaps<4>(&v_filter, taps);
-
if (width == 4) {
- FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
}
- } else {
- SetupTaps<4>(&v_filter, taps);
-
- if (width == 4) {
- FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
- } else {
- FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
- width, height, taps);
- }
}
}
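The substance of these dispatch changes is that vertical_taps now comes from the two-argument GetNumTapsInFilter added in convolve_sse4.inc (see below): for filter_index == 1, most |filter_id| values have zero-valued outer taps and can take the cheaper 4-tap kernels. A sketch mirroring that rule (names follow the patch; not part of this file):

// Only ids 1, 7, 8, 9 and 15 of filter_index == 1 need all 6 taps; the
// remaining ids reduce to 4-tap filters.
inline int VerticalTapsForIndex1(const int filter_id) {
  const bool needs_six = filter_id == 1 || filter_id == 7 || filter_id == 8 ||
                         filter_id == 9 || filter_id == 15;
  return needs_six ? 6 : 4;
}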
@@ -656,7 +625,8 @@ void ConvolveCompound2D_SSE4_1(
// Similarly for height.
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
const int intermediate_height = height + vertical_taps - 1;
const ptrdiff_t src_stride = reference_stride;
const auto* const src = static_cast<const uint8_t*>(reference) -
@@ -933,7 +903,7 @@ inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
source);
StoreLo8(intermediate, RightShiftWithRounding_S16(
- SumOnePassTaps<filter_index>(source, taps),
+ SumOnePassTaps<num_taps>(source, taps),
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate += kIntermediateStride;
@@ -960,10 +930,9 @@ inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source);
// Shift by one less because the taps are halved.
- StoreAligned16(
- intermediate_x,
- RightShiftWithRounding_S16(SumOnePassTaps<filter_index>(source, taps),
- kInterRoundBitsHorizontal - 1));
+ StoreAligned16(intermediate_x, RightShiftWithRounding_S16(
+ SumOnePassTaps<num_taps>(source, taps),
+ kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
} while (--y != 0);
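In the scale hunks below, the one-argument calls are qualified as dsp::GetNumTapsInFilter, presumably because the new file-local GetNumTapsInFilter(int, int) hides the namespace-level function: C++ name hiding applies even when arities differ. A minimal illustration with made-up namespaces:

namespace outer {
inline int F(int x) { return x; }
namespace inner {
inline int F(int x, int y) { return x + y; }
// An unqualified F(x) here would not find outer::F(int); the local
// F(int, int) hides it, so the one-argument overload must be reached
// by qualification.
inline int Call(int x) { return outer::F(x); }
}  // namespace inner
}  // namespace outer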
@@ -1188,7 +1157,7 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
alignas(16) int16_t
intermediate_result[kIntermediateAllocWidth *
(2 * kIntermediateAllocWidth + kSubPixelTaps)];
- const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+ const int num_vert_taps = dsp::GetNumTapsInFilter(vert_filter_index);
const int intermediate_height =
(((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
kScaleSubPixelBits) +
@@ -1211,7 +1180,7 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
// inputs in each iteration on large blocks. When step_x is large, we need a
// second register and alignr in order to gather all filter inputs.
// |num_taps| - 1 is the offset for the shuffle of inputs to the final tap.
- const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+ const int num_horiz_taps = dsp::GetNumTapsInFilter(horiz_filter_index);
const int kernel_start_ceiling = 16 - num_horiz_taps;
// This truncated quotient |grade_x_threshold| selects |step_x| such that:
// (step_x * 7) >> kScaleSubPixelBits < single load limit
@@ -1891,7 +1860,7 @@ void Init8bpp() {
dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1;
dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1;
- dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4;
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4_1;
dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1;
dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1;
dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1;
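The renamed ConvolveCompoundCopy_SSE4_1 only widens and pre-scales: each 8-bit source pixel is shifted left into the 16-bit compound domain. A scalar model of the stores in its hunk above, with the shift taken from the patch's kRoundBitsVertical context:

#include <cstdint>

// Widen u8 -> u16 and pre-scale so later compound blending can
// round-shift back down; mirrors the v_dest_lo/v_dest_hi stores.
inline void CompoundCopyRow(const uint8_t* src, uint16_t* dest,
                            const int width, const int round_bits) {
  for (int x = 0; x < width; ++x) {
    dest[x] = static_cast<uint16_t>(src[x] << round_bits);
  }
}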
diff --git a/src/dsp/x86/convolve_sse4.inc b/src/dsp/x86/convolve_sse4.inc
index 550d6a4..5548c5b 100644
--- a/src/dsp/x86/convolve_sse4.inc
+++ b/src/dsp/x86/convolve_sse4.inc
@@ -18,20 +18,63 @@
#include "src/dsp/convolve.inc"
+// This version checks for the special cases when filter_index == 1.
+int GetNumTapsInFilter(const int filter_index, const int filter_id) {
+ if (filter_index == 0) {
+ // Despite the names these only use 6 taps.
+ // kInterpolationFilterEightTap
+ // kInterpolationFilterEightTapSmooth
+ return 6;
+ }
+
+ if (filter_index == 1) {
+ // Despite the names these only use 6 taps.
+ // kInterpolationFilterEightTap
+ // kInterpolationFilterEightTapSmooth
+ if (((filter_id == 1) | (filter_id == 15) | (filter_id == 7) |
+ (filter_id == 8) | (filter_id == 9)) != 0) {
+ return 6;
+ }
+ // When |filter_index| == 1, the |filter_id| values not listed above map to
+ // 4 tap filters.
+ return 4;
+ }
+
+ if (filter_index == 2) {
+ // kInterpolationFilterEightTapSharp
+ return 8;
+ }
+
+ if (filter_index == 3) {
+ // kInterpolationFilterBilinear
+ return 2;
+ }
+
+ assert(filter_index > 3);
+ // For small sizes (width/height <= 4) the large filters are replaced with 4
+ // tap options.
+ // If the original filters were |kInterpolationFilterEightTap| or
+ // |kInterpolationFilterEightTapSharp| then it becomes
+ // |kInterpolationFilterSwitchable|.
+ // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
+ // tap filter.
+ return 4;
+}
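Because the id-dependent mapping is easy to get wrong, here is a standalone mirror of it (using || instead of the branch-free |), convenient as a quick unit check. The function name is local to this sketch, not part of the patch:

#include <cassert>

int NumTapsForFilter(int filter_index, int filter_id) {
  if (filter_index == 0) return 6;
  if (filter_index == 1) {
    return (filter_id == 1 || filter_id == 7 || filter_id == 8 ||
            filter_id == 9 || filter_id == 15)
               ? 6
               : 4;
  }
  if (filter_index == 2) return 8;  // kInterpolationFilterEightTapSharp
  if (filter_index == 3) return 2;  // kInterpolationFilterBilinear
  return 4;                         // small-size 4 tap replacements
}

int main() {
  assert(NumTapsForFilter(1, 7) == 6);   // id on the 6-tap list
  assert(NumTapsForFilter(1, 3) == 4);   // other ids collapse to 4 taps
  assert(NumTapsForFilter(3, 12) == 2);  // bilinear
  return 0;
}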
+
// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
// sum from outranging int16_t.
-template <int filter_index>
+template <int num_taps>
__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
__m128i sum;
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1
const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3
const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5
sum = _mm_add_epi16(v_madd_21, v_madd_43);
sum = _mm_add_epi16(sum, v_madd_65);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0
const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2
@@ -40,7 +83,7 @@ __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3
} else {
@@ -52,13 +95,13 @@ __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
return sum;
}
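For clarity, this is what SumOnePassTaps<num_taps> produces per 16-bit lane, written as a scalar sketch (name local to the sketch). The pre-shift of the taps is what keeps the accumulated sum within int16_t:

#include <cstdint>

// Illustrative scalar equivalent of SumOnePassTaps for one lane. |taps|
// holds the AV1 coefficients pre-shifted right by 1.
int16_t OnePassTapSum(const uint8_t* src, const int8_t* taps, int num_taps) {
  int32_t sum = 0;
  for (int k = 0; k < num_taps; ++k) sum += src[k] * taps[k];
  return static_cast<int16_t>(sum);
}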
-template <int filter_index>
+template <int num_taps>
__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
const __m128i* const v_tap) {
// 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);
- if (filter_index == 3) {
+ if (num_taps == 2) {
// 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
const __m128i v_src_43 = _mm_shuffle_epi8(
v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
@@ -79,10 +122,10 @@ __m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
return v_sum_5432;
}
-template <int filter_index>
+template <int num_taps>
__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
const __m128i* const v_tap) {
- __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
// Normally the Horizontal pass does the downshift in two passes:
// kInterRoundBitsHorizontal - 1 and then (kFilterBits -
@@ -95,11 +138,10 @@ __m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
return _mm_packus_epi16(sum, sum);
}
-template <int filter_index>
+template <int num_taps>
__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
const __m128i* const v_tap) {
- const __m128i sum =
- SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ const __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
@@ -411,36 +453,34 @@ __m128i Compound1DShift(const __m128i sum) {
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
-template <int filter_index>
+template <int num_taps>
__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
__m128i v_src[4];
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
- } else if (filter_index > 3) {
+ } else {
// 4 taps.
v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
}
- const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+ const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
return sum;
}
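The unpacklo_epi8 interleaving pairs two vertically adjacent rows per register so that each _mm_maddubs_epi16 lane multiplies both rows by the matching pair of halved taps. An illustrative scalar model of one output lane (names local to this sketch):

#include <cstdint>

// num_taps is always even here (2, 4, 6, or 8).
int16_t VerticalLane(const uint8_t* const* rows, const int8_t* taps,
                     int num_taps, int x) {
  int32_t sum = 0;
  for (int k = 0; k < num_taps; k += 2) {
    // One _mm_maddubs_epi16 lane: two interleaved rows times a tap pair.
    sum += rows[k][x] * taps[k] + rows[k + 1][x] * taps[k + 1];
  }
  return static_cast<int16_t>(sum);
}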
-// TODO(slavarnway): Use num_taps instead of filter_index for templates. See the
-// 2D version.
-template <int num_taps, int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
void* const dst, const ptrdiff_t dst_stride,
const int height, const __m128i* const v_tap) {
@@ -468,7 +508,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
// 10 11 12 13 20 21 22 23
srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -515,7 +555,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
// 30 31 32 33 40 41 42 43
srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -574,7 +614,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
// 50 51 52 53 60 61 62 63
srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -645,7 +685,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
// 70 71 72 73 80 81 82 83
srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -672,7 +712,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
}
}
-template <int num_taps, int filter_index, bool negative_outside_taps = false>
+template <int num_taps, bool negative_outside_taps = false>
void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
void* const dst, const ptrdiff_t dst_stride,
const int height, const __m128i* const v_tap) {
@@ -705,7 +745,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
// 10 11 20 21 30 31 40 41
srcs[1] = _mm_srli_si128(srcs_0_2, 2);
// This uses srcs[0]..srcs[1].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m128i results_16 =
RightShiftWithRounding_S16(sums, kFilterBits - 1);
const __m128i results = _mm_packus_epi16(results_16, results_16);
@@ -760,7 +800,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[3] = _mm_srli_si128(srcs_0_4, 6);
// This uses srcs[0]..srcs[3].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m128i results_16 =
RightShiftWithRounding_S16(sums, kFilterBits - 1);
const __m128i results = _mm_packus_epi16(results_16, results_16);
@@ -829,7 +869,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[5] = _mm_srli_si128(srcs_4_8, 2);
// This uses srcs[0]..srcs[5].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m128i results_16 =
RightShiftWithRounding_S16(sums, kFilterBits - 1);
const __m128i results = _mm_packus_epi16(results_16, results_16);
@@ -909,7 +949,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[7] = _mm_srli_si128(srcs_4_8, 6);
// This uses srcs[0]..srcs[7].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m128i results_16 =
RightShiftWithRounding_S16(sums, kFilterBits - 1);
const __m128i results = _mm_packus_epi16(results_16, results_16);
diff --git a/src/dsp/x86/distance_weighted_blend_sse4.cc b/src/dsp/x86/distance_weighted_blend_sse4.cc
index c813df4..8c32117 100644
--- a/src/dsp/x86/distance_weighted_blend_sse4.cc
+++ b/src/dsp/x86/distance_weighted_blend_sse4.cc
@@ -34,54 +34,50 @@ namespace low_bitdepth {
namespace {
constexpr int kInterPostRoundBit = 4;
+constexpr int kInterPostRhsAdjust = 1 << (16 - kInterPostRoundBit - 1);
inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
const __m128i& pred1,
- const __m128i& weights) {
- // TODO(https://issuetracker.google.com/issues/150325685): Investigate range.
- const __m128i preds_lo = _mm_unpacklo_epi16(pred0, pred1);
- const __m128i mult_lo = _mm_madd_epi16(preds_lo, weights);
- const __m128i result_lo =
- RightShiftWithRounding_S32(mult_lo, kInterPostRoundBit + 4);
-
- const __m128i preds_hi = _mm_unpackhi_epi16(pred0, pred1);
- const __m128i mult_hi = _mm_madd_epi16(preds_hi, weights);
- const __m128i result_hi =
- RightShiftWithRounding_S32(mult_hi, kInterPostRoundBit + 4);
-
- return _mm_packs_epi32(result_lo, result_hi);
+ const __m128i& weight) {
+ // Given: p0 and p1 in the range [-5132, 9212], with weights w0 + w1 == 16.
+ // Output: (p0 * w0 + p1 * w1 + 128) >> 8, where 128 is the rounding bit
+ // and 8 == kInterPostRoundBit + 4.
+ // The formula is rearranged to avoid widening to 32 bits:
+ // p0 * w0 + p1 * w1 = p0 * w0 + (16 - w0) * p1
+ //                   = (p0 - p1) * w0 + 16 * p1
+ // The maximum absolute value of p0 - p1 is 9212 + 5132 = 0x3808.
+ const __m128i diff = _mm_slli_epi16(_mm_sub_epi16(pred0, pred1), 1);
+ // (((p0 - p1) << 1) * (w0 << 11)) >> 16 == ((p0 - p1) * w0) >> 4
+ const __m128i weighted_diff = _mm_mulhi_epi16(diff, weight);
+ // ((p0 - p1) * w0 >> 4) + p1
+ const __m128i upscaled_average = _mm_add_epi16(weighted_diff, pred1);
+ // (x << 11) >> 15 == x >> 4
+ const __m128i right_shift_prep = _mm_set1_epi16(kInterPostRhsAdjust);
+ // (((p0 - p1) * w0 >> 4) + p1 + (128 >> 4)) >> 4
+ return _mm_mulhrs_epi16(upscaled_average, right_shift_prep);
}
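The rearrangement maps onto the intrinsics as follows. A scalar walk-through of one lane, assuming w0 < 16 so that (w0 << 11) fits in int16_t (the function name is local to this sketch):

#include <cstdint>

int16_t WeightedAverageLane(int16_t p0, int16_t p1, int w0) {
  // _mm_slli_epi16(sub, 1): double the difference so mulhi lands on >> 4.
  const int32_t diff = (p0 - p1) * 2;
  // _mm_mulhi_epi16 with (w0 << 11): the high 16 bits of the product, i.e.
  // ((p0 - p1) * 2 * (w0 << 11)) >> 16 == ((p0 - p1) * w0) >> 4.
  const int32_t weighted_diff = (diff * (w0 << 11)) >> 16;
  // (((p0 - p1) * w0) >> 4) + p1 == (p0 * w0 + (16 - w0) * p1) >> 4.
  const int32_t upscaled = weighted_diff + p1;
  // _mm_mulhrs_epi16 with (1 << 11): (x * (1 << 12) + (1 << 15)) >> 16,
  // i.e. (x + 8) >> 4, supplying the final shift and the rounding bit.
  return static_cast<int16_t>((upscaled * (1 << 12) + (1 << 15)) >> 16);
}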
template <int height>
inline void DistanceWeightedBlend4xH_SSE4_1(
const int16_t* LIBGAV1_RESTRICT pred_0,
- const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
- const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
- const ptrdiff_t dest_stride) {
+ const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight,
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
- const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+ // Upscale the weight for mulhi.
+ const __m128i weights = _mm_set1_epi16(weight << 11);
for (int y = 0; y < height; y += 4) {
- // TODO(b/150326556): Use larger loads.
- const __m128i src_00 = LoadLo8(pred_0);
- const __m128i src_10 = LoadLo8(pred_1);
- pred_0 += 4;
- pred_1 += 4;
- __m128i src_0 = LoadHi8(src_00, pred_0);
- __m128i src_1 = LoadHi8(src_10, pred_1);
- pred_0 += 4;
- pred_1 += 4;
- const __m128i res0 = ComputeWeightedAverage8(src_0, src_1, weights);
-
- const __m128i src_01 = LoadLo8(pred_0);
- const __m128i src_11 = LoadLo8(pred_1);
- pred_0 += 4;
- pred_1 += 4;
- src_0 = LoadHi8(src_01, pred_0);
- src_1 = LoadHi8(src_11, pred_1);
- pred_0 += 4;
- pred_1 += 4;
- const __m128i res1 = ComputeWeightedAverage8(src_0, src_1, weights);
+ const __m128i src_00 = LoadAligned16(pred_0);
+ const __m128i src_10 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res0 = ComputeWeightedAverage8(src_00, src_10, weights);
+
+ const __m128i src_01 = LoadAligned16(pred_0);
+ const __m128i src_11 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res1 = ComputeWeightedAverage8(src_01, src_11, weights);
const __m128i result_pixels = _mm_packus_epi16(res0, res1);
Store4(dst, result_pixels);
@@ -101,11 +97,11 @@ inline void DistanceWeightedBlend4xH_SSE4_1(
template <int height>
inline void DistanceWeightedBlend8xH_SSE4_1(
const int16_t* LIBGAV1_RESTRICT pred_0,
- const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
- const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
- const ptrdiff_t dest_stride) {
+ const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight,
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
- const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+ // Upscale the weight for mulhi.
+ const __m128i weights = _mm_set1_epi16(weight << 11);
for (int y = 0; y < height; y += 2) {
const __m128i src_00 = LoadAligned16(pred_0);
@@ -130,11 +126,12 @@ inline void DistanceWeightedBlend8xH_SSE4_1(
inline void DistanceWeightedBlendLarge_SSE4_1(
const int16_t* LIBGAV1_RESTRICT pred_0,
- const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
- const uint8_t weight_1, const int width, const int height,
- void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
+ const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight,
+ const int width, const int height, void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
- const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+ // Upscale the weight for mulhi.
+ const __m128i weights = _mm_set1_epi16(weight << 11);
int y = height;
do {
@@ -162,23 +159,24 @@ inline void DistanceWeightedBlendLarge_SSE4_1(
void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
const void* LIBGAV1_RESTRICT prediction_1,
const uint8_t weight_0,
- const uint8_t weight_1, const int width,
+ const uint8_t /*weight_1*/, const int width,
const int height,
void* LIBGAV1_RESTRICT const dest,
const ptrdiff_t dest_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ const uint8_t weight = weight_0;
if (width == 4) {
if (height == 4) {
- DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight, dest,
+ dest_stride);
} else if (height == 8) {
- DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight, dest,
+ dest_stride);
} else {
assert(height == 16);
- DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight, dest,
+ dest_stride);
}
return;
}
@@ -186,28 +184,28 @@ void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
if (width == 8) {
switch (height) {
case 4:
- DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight, dest,
+ dest_stride);
return;
case 8:
- DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight, dest,
+ dest_stride);
return;
case 16:
- DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight, dest,
+ dest_stride);
return;
default:
assert(height == 32);
- DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight, dest,
+ dest_stride);
return;
}
}
- DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width,
- height, dest, dest_stride);
+ DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight, width, height, dest,
+ dest_stride);
}
void Init8bpp() {
@@ -273,27 +271,19 @@ inline void DistanceWeightedBlend4xH_SSE4_1(
int y = height;
do {
- const __m128i src_00 = LoadLo8(pred_0);
- const __m128i src_10 = LoadLo8(pred_1);
- pred_0 += 4;
- pred_1 += 4;
- __m128i src_0 = LoadHi8(src_00, pred_0);
- __m128i src_1 = LoadHi8(src_10, pred_1);
- pred_0 += 4;
- pred_1 += 4;
+ const __m128i src_00 = LoadAligned16(pred_0);
+ const __m128i src_10 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
const __m128i res0 =
- ComputeWeightedAverage8(src_0, src_1, weight0, weight1);
-
- const __m128i src_01 = LoadLo8(pred_0);
- const __m128i src_11 = LoadLo8(pred_1);
- pred_0 += 4;
- pred_1 += 4;
- src_0 = LoadHi8(src_01, pred_0);
- src_1 = LoadHi8(src_11, pred_1);
- pred_0 += 4;
- pred_1 += 4;
+ ComputeWeightedAverage8(src_00, src_10, weight0, weight1);
+
+ const __m128i src_01 = LoadAligned16(pred_0);
+ const __m128i src_11 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
const __m128i res1 =
- ComputeWeightedAverage8(src_0, src_1, weight0, weight1);
+ ComputeWeightedAverage8(src_01, src_11, weight0, weight1);
StoreLo8(dst, res0);
dst += dest_stride;
diff --git a/src/dsp/x86/film_grain_sse4.cc b/src/dsp/x86/film_grain_sse4.cc
index 9ece947..59d18a6 100644
--- a/src/dsp/x86/film_grain_sse4.cc
+++ b/src/dsp/x86/film_grain_sse4.cc
@@ -23,14 +23,15 @@
#include <cstdint>
#include <cstring>
-#include "src/dsp/common.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/film_grain_common.h"
#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/array_2d.h"
#include "src/utils/common.h"
#include "src/utils/compiler_attributes.h"
-#include "src/utils/logging.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
namespace libgav1 {
namespace dsp {
@@ -165,7 +166,7 @@ void BlendNoiseWithImageLuma_SSE4_1(
int y = 0;
do {
int x = 0;
- for (; x < safe_width; x += 8) {
+ for (; x + 8 <= safe_width; x += 8) {
const __m128i orig = LoadSource(&in_y_row[x]);
const __m128i scaling =
GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
@@ -181,6 +182,7 @@ void BlendNoiseWithImageLuma_SSE4_1(
// Prevent arbitrary indices from entering GetScalingFactors.
memset(luma_buffer, 0, sizeof(luma_buffer));
const int valid_range = width - x;
+ assert(valid_range < 8);
memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0]));
luma_buffer[valid_range] = in_y_row[width - 1];
const __m128i orig = LoadSource(&in_y_row[x]);
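A small standalone illustration of the tightened bound, assuming a safe_width that is not a multiple of 8: the old condition allowed a vector iteration that read past safe_width, while the new one leaves the remainder to the zero-padded tail path above:

#include <cassert>

int main() {
  // With safe_width == 12, the old bound (x < safe_width) would run the
  // vector loop at x = 8 and touch pixels [8, 15], four past safe_width.
  // The tightened bound stops after x = 0 and leaves [8, 11] to the tail.
  const int safe_width = 12;
  int last_vector_x = -1;
  for (int x = 0; x + 8 <= safe_width; x += 8) last_vector_x = x;
  assert(last_vector_x == 0);
  return 0;
}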
@@ -239,7 +241,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
int y = 0;
do {
int x = 0;
- for (; x < safe_chroma_width; x += 8) {
+ for (; x + 8 <= safe_chroma_width; x += 8) {
const int luma_x = x << subsampling_x;
const __m128i average_luma =
GetAverageLuma(&in_y_row[luma_x], subsampling_x);
@@ -252,8 +254,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
}
- // This section only runs if width % (8 << sub_x) != 0. It should never run
- // on 720p and above.
if (x < chroma_width) {
// Prevent huge indices from entering GetScalingFactors due to
// uninitialized values. This is not a problem in 8bpp because the table
@@ -365,7 +365,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
int y = 0;
do {
int x = 0;
- for (; x < safe_chroma_width; x += 8) {
+ for (; x + 8 <= safe_chroma_width; x += 8) {
const int luma_x = x << subsampling_x;
const __m128i average_luma =
GetAverageLuma(&in_y_row[luma_x], subsampling_x);
diff --git a/src/dsp/x86/intrapred_directional_sse4.cc b/src/dsp/x86/intrapred_directional_sse4.cc
index e642aee..bc61745 100644
--- a/src/dsp/x86/intrapred_directional_sse4.cc
+++ b/src/dsp/x86/intrapred_directional_sse4.cc
@@ -624,14 +624,6 @@ inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
}
}
-// The height at which a load of 16 bytes will not contain enough source pixels
-// from |left_column| to supply an accurate row when computing 8 pixels at a
-// time. The values are found by inspection. By coincidence, all angles that
-// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
-// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
- 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
-
template <bool upsampled>
inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
@@ -729,6 +721,103 @@ inline void DirectionalZone1Blend_8xH(
}
}
+template <bool shuffle_left_column, bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_8xH(
+ uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+ const uint8_t* LIBGAV1_RESTRICT const top_row,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
+ const int xstep, const int ystep, const int x, const int left_offset,
+ const __m128i& xstep_for_shift, const __m128i& xstep_bounds_base,
+ const __m128i& left_y) {
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+ // Loop increments for moving by 8x8 blocks. This function also handles
+ // blocks of height 4, which are computed in a single pass, so in that
+ // case these variables go unused.
+ const ptrdiff_t stride8 = stride << 3;
+ const int xstep8 = xstep << 3;
+ const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
+
+ // Cover 8x4 case.
+ const int min_height = (height == 4) ? 4 : 8;
+
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
+ uint8_t* dst_x = dst + x;
+
+ // Round down to the nearest multiple of 8 (or 4, if height is 4).
+ const int max_top_only_y =
+ std::min(((x + 1) << 6) / xstep, height) & ~(min_height - 1);
+ DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ DirectionalZone1_4xH(dst_x + 4, stride,
+ top_row + ((x + 4) << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ if (max_top_only_y == height) return;
+
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ const __m128i dest_index_x =
+ _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
+ const __m128i sampler_top =
+ upsampled_top
+ ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+ const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+ // For this set of columns, all rows from |min_left_only_y| down need only
+ // |left_column| to compute.
+ const int min_left_only_y =
+ Align(std::min(((x + 8) << 6) / xstep, height), 8);
+
+ __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+ __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+ int top_x = -xstep_y;
+
+ const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+ top_x -= xstep8) {
+ // Pick up from the last y-value. The else branch uses the roughly 10%
+ // slower but overread-safe method for left prediction.
+ if (shuffle_left_column) {
+ DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y);
+ } else {
+ DirectionalZone3_8xH<upsampled_left, 8>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep);
+ }
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+ DirectionalZone1Blend_8xH<upsampled_top, 8>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_8xH<upsampled_left, 8>(
+ dst_x, stride, left_column + ((left_offset + y) << upsample_left_shift),
+ base_left_y, -ystep);
+ }
+}
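A sketch of the three y-stage boundaries this function computes for one 8-wide strip at column x (names local to this sketch, not the library's). Rows [0, top_only_end) read only top_row; rows [top_only_end, left_only_begin) blend top and left; rows from left_only_begin on read only left_column:

#include <algorithm>

void Zone2StageBounds(int x, int xstep, int height, int min_height,
                      int* top_only_end, int* left_only_begin) {
  // Round down to a multiple of min_height (8, or 4 for 8x4 blocks).
  *top_only_end =
      std::min(((x + 1) << 6) / xstep, height) & ~(min_height - 1);
  // Align(v, 8) in the source rounds up to a multiple of 8.
  *left_only_begin =
      (std::min(((x + 8) << 6) / xstep, height) + 7) & ~7;
}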
+
// 7.11.2.4 (8) 90 < angle < 180
// The strategy for this function is to know how many blocks can be processed
// with just pixels from |top_ptr|, then handle mixed blocks, then handle only
@@ -742,29 +831,11 @@ inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
const int width, const int height,
const int xstep, const int ystep) {
auto* dst = static_cast<uint8_t*>(dest);
- const int upsample_left_shift = static_cast<int>(upsampled_left);
const int upsample_top_shift = static_cast<int>(upsampled_top);
- const __m128i max_shift = _mm_set1_epi8(32);
- const ptrdiff_t stride8 = stride << 3;
- const __m128i dest_index_x =
- _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
- const __m128i sampler_top =
- upsampled_top
- ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
- : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
- const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
- // All columns from |min_top_only_x| to the right will only need |top_row| to
- // compute. This assumes minimum |xstep| is 3.
+ // All columns from |min_top_only_x| to the right need only |top_row|
+ // to compute. This assumes the minimum |xstep| is 3.
const int min_top_only_x = std::min((height * xstep) >> 6, width);
- // For steep angles, the source pixels from left_column may not fit in a
- // 16-byte load for shuffling.
- // TODO(petersonab): Find a more precise formula for this subject to x.
- const int max_shuffle_height =
- std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]);
-
- const int xstep8 = xstep << 3;
- const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
// Accumulate xstep across 8 rows.
const __m128i xstep_dup = _mm_set1_epi16(-xstep);
const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
@@ -787,105 +858,39 @@ inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
// offset. Following values need the full ystep as a relative offset.
const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+ const __m128i dest_index_x =
+ _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
__m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
left_y = _mm_add_epi16(ystep_init, left_y);
+ // Analysis shows that, for most angles (ystep < 132), all segments that
+ // use both top_row and left_column can be computed from left_column with
+ // byte shuffles of a single vector. For steeper angles, the shuffle is
+ // also fully reliable once x >= 32.
+ const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
+ const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
int x = 0;
- // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
- // The first stage, before the first y-loop, covers blocks that are only
- // computed from the top row. The second stage, comprising two y-loops, covers
- // blocks that have a mixture of values computed from top or left. The final
- // stage covers blocks that are only computed from the left.
+ for (int left_offset = -left_base_increment; x < min_shuffle_x;
+ x += 8,
+ xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
+ // Watch left_y because it can still get big.
+ left_y = _mm_add_epi16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ DirectionalZone2_8xH<false, upsampled_left, upsampled_top>(
+ dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+ xstep_for_shift, xstep_bounds_base, left_y);
+ }
for (int left_offset = -left_base_increment; x < min_top_only_x;
x += 8,
xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
// Watch left_y because it can still get big.
left_y = _mm_add_epi16(left_y, increment_left8),
left_offset -= left_base_increment8) {
- uint8_t* dst_x = dst + x;
-
- // Round down to the nearest multiple of 8.
- const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
- DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
- max_top_only_y, -xstep, upsampled_top);
- DirectionalZone1_4xH(dst_x + 4, stride,
- top_row + ((x + 4) << upsample_top_shift),
- max_top_only_y, -xstep, upsampled_top);
-
- int y = max_top_only_y;
- dst_x += stride * y;
- const int xstep_y = xstep * y;
- const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
- // All rows from |min_left_only_y| down for this set of columns, only need
- // |left_column| to compute.
- const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
- // At high angles such that min_left_only_y < 8, ystep is low and xstep is
- // high. This means that max_shuffle_height is unbounded and xstep_bounds
- // will overflow in 16 bits. This is prevented by stopping the first
- // blending loop at min_left_only_y for such cases, which means we skip over
- // the second blending loop as well.
- const int left_shuffle_stop_y =
- std::min(max_shuffle_height, min_left_only_y);
- __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
- __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
- int top_x = -xstep_y;
-
- for (; y < left_shuffle_stop_y;
- y += 8, dst_x += stride8,
- xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
- xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
- top_x -= xstep8) {
- DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), left_y);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
- shift_mask),
- 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
- __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
- DirectionalZone1Blend_8xH<upsampled_top, 8>(
- dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
- xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
- }
- // Pick up from the last y-value, using the 10% slower but secure method for
- // left prediction.
- const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
- for (; y < min_left_only_y;
- y += 8, dst_x += stride8,
- xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
- xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
- top_x -= xstep8) {
- const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
-
- DirectionalZone3_8xH<upsampled_left, 8>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
- -ystep);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
- shift_mask),
- 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
- DirectionalZone1Blend_8xH<upsampled_top, 8>(
- dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
- xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
- }
- // Loop over y for left_only rows.
- for (; y < height; y += 8, dst_x += stride8) {
- DirectionalZone3_8xH<upsampled_left, 8>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
- -ystep);
- }
+ DirectionalZone2_8xH<true, upsampled_left, upsampled_top>(
+ dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+ xstep_for_shift, xstep_bounds_base, left_y);
}
for (; x < width; x += 4) {
DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
@@ -952,8 +957,8 @@ inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
left_offset -= left_base_increment4) {
uint8_t* dst_x = dst + x;
- // Round down to the nearest multiple of 8.
- const int max_top_only_y = std::min((x << 6) / xstep, height) & 0xFFFFFFF4;
+ // Round down to the nearest multiple of 4.
+ const int max_top_only_y = std::min((x << 6) / xstep, height) & ~3;
DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
max_top_only_y, -xstep, upsampled_top);
int y = max_top_only_y;
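The replaced mask deserves a second look: 0xFFFFFFF4 clears bit 3 in addition to bits 0 and 1, so it was not a round-down mask at all. A quick standalone check (assuming the usual 32-bit int):

#include <cassert>

int main() {
  // 12 is already a multiple of 4, but the old mask zeroed bit 3.
  assert((12 & 0xFFFFFFF4) == 4);   // old mask: wrong result
  // ~3 rounds down to the nearest multiple of 4, as the new comment states.
  assert((12 & ~3) == 12);
  assert((13 & ~3) == 12);
  return 0;
}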
diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc
index 3363f0e..b4df072 100644
--- a/src/dsp/x86/loop_restoration_sse4.cc
+++ b/src/dsp/x86/loop_restoration_sse4.cc
@@ -2088,6 +2088,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter(
uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
__m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+ ma5[1] = _mm_setzero_si128(); // Quiets -Wmaybe-uninitialized with gcc.
s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
sq[0][0] = SquareLo8(s[0][0]);
diff --git a/src/dsp/x86/mask_blend_sse4.cc b/src/dsp/x86/mask_blend_sse4.cc
index a18444b..833814c 100644
--- a/src/dsp/x86/mask_blend_sse4.cc
+++ b/src/dsp/x86/mask_blend_sse4.cc
@@ -30,35 +30,81 @@
namespace libgav1 {
namespace dsp {
-namespace low_bitdepth {
namespace {
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride) {
+ if (subsampling_x == 1 && subsampling_y == 1) {
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i mask_val_0 = LoadUnaligned16(mask);
+ const __m128i mask_val_1 = LoadUnaligned16(mask + stride);
+ const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1);
+ const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
+ return RightShiftWithRounding_U16(mask_0, 2);
+ }
+ if (subsampling_x == 1) {
+ const __m128i row_vals = LoadUnaligned16(mask);
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+ const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+ __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+ return RightShiftWithRounding_U16(subsampled_mask, 1);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const __m128i mask_val = LoadLo8(mask);
+ return _mm_cvtepu8_epi16(mask_val);
+}
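A scalar reference for one lane of the 4:2:0 path of GetMask8 above: the rounded average of a 2x2 block of mask bytes. Mask values are at most 64, so the saturating byte add in the vector path never saturates (name local to this sketch):

#include <cstddef>
#include <cstdint>

uint16_t SubsampledMaskLane(const uint8_t* mask, ptrdiff_t stride, int x) {
  const int sum = mask[2 * x] + mask[2 * x + 1] +
                  mask[2 * x + stride] + mask[2 * x + stride + 1];
  return static_cast<uint16_t>((sum + 2) >> 2);  // RightShiftWithRounding
}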
+
+// Imitate behavior of ARM vtrn1q_u64.
+inline __m128i Transpose1_U64(const __m128i a, const __m128i b) {
+ return _mm_castps_si128(
+ _mm_movelh_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
+}
+
+// Imitate behavior of ARM vtrn2q_u64.
+inline __m128i Transpose2_U64(const __m128i a, const __m128i b) {
+ return _mm_castps_si128(
+ _mm_movehl_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
+}
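A small demonstration of the two helpers, treating each register as a pair of 64-bit halves. Note that Transpose2_U64(a, b) yields [b.hi | a.hi], i.e. vtrn2q_u64 with swapped operands, which the caller accounts for (sketch only, not part of the patch):

#include <smmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  const __m128i a = _mm_set_epi64x(/*hi=*/0x11, /*lo=*/0x10);
  const __m128i b = _mm_set_epi64x(/*hi=*/0x21, /*lo=*/0x20);
  // Transpose1_U64(a, b) == [a.lo | b.lo], like vtrn1q_u64(a, b).
  const __m128i lo = _mm_castps_si128(
      _mm_movelh_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
  // Transpose2_U64(a, b) == [b.hi | a.hi].
  const __m128i hi = _mm_castps_si128(
      _mm_movehl_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
  printf("%llx %llx\n", (unsigned long long)_mm_extract_epi64(lo, 0),
         (unsigned long long)_mm_extract_epi64(lo, 1));  // prints: 10 20
  printf("%llx %llx\n", (unsigned long long)_mm_extract_epi64(hi, 0),
         (unsigned long long)_mm_extract_epi64(hi, 1));  // prints: 21 11
  return 0;
}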
+
// Width can only be 4 when it is subsampled from a block of width 8, hence
// subsampling_x is always 1 when this function is called.
template <int subsampling_x, int subsampling_y>
-inline __m128i GetMask4x2(const uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+inline __m128i GetMask4x2(const uint8_t* mask) {
+ if (subsampling_x == 1 && subsampling_y == 1) {
+ const __m128i mask_val_01 = LoadUnaligned16(mask);
+ // Stride is fixed because this is the smallest block size.
+ const __m128i mask_val_23 = LoadUnaligned16(mask + 16);
+ // Transpose rows to add row 0 to row 1, and row 2 to row 3.
+ const __m128i mask_val_02 = Transpose1_U64(mask_val_01, mask_val_23);
+ const __m128i mask_val_13 = Transpose2_U64(mask_val_23, mask_val_01);
+ const __m128i add_0 = _mm_adds_epu8(mask_val_02, mask_val_13);
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
+ return RightShiftWithRounding_U16(mask_0, 2);
+ }
+ return GetMask8<subsampling_x, 0>(mask, 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetInterIntraMask4x2(const uint8_t* mask,
+ ptrdiff_t mask_stride) {
if (subsampling_x == 1) {
- const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
- const __m128i mask_val_1 =
- _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
- __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
- if (subsampling_y == 1) {
- const __m128i next_mask_val_0 =
- _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride));
- const __m128i next_mask_val_1 =
- _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride * 3));
- subsampled_mask = _mm_add_epi16(
- subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
- }
- return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+ return GetMask4x2<subsampling_x, subsampling_y>(mask);
}
+ // When using intra or difference weighted masks, the function doesn't use
+ // subsampling, so |mask_stride| may be 4 or 8.
+ assert(subsampling_y == 0 && subsampling_x == 0);
const __m128i mask_val_0 = Load4(mask);
const __m128i mask_val_1 = Load4(mask + mask_stride);
return _mm_cvtepu8_epi16(
_mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
}
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
// This function returns a 16-bit packed mask to fit in _mm_madd_epi16.
// 16-bit is also the lowest packing for hadd, but without subsampling there is
// an unfortunate conversion required.
@@ -87,38 +133,6 @@ inline __m128i GetMask8(const uint8_t* LIBGAV1_RESTRICT mask,
return _mm_cvtepu8_epi16(mask_val);
}
-// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because,
-// when is_inter_intra is true, the prediction values are brought to 8-bit
-// packing as well.
-template <int subsampling_x, int subsampling_y>
-inline __m128i GetInterIntraMask8(const uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t stride) {
- if (subsampling_x == 1) {
- const __m128i row_vals = LoadUnaligned16(mask);
-
- const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
- const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
- __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
-
- if (subsampling_y == 1) {
- const __m128i next_row_vals = LoadUnaligned16(mask + stride);
- const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
- const __m128i next_mask_val_1 =
- _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
- subsampled_mask = _mm_add_epi16(
- subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
- }
- const __m128i ret =
- RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
- return _mm_packus_epi16(ret, ret);
- }
- assert(subsampling_y == 0 && subsampling_x == 0);
- // Unfortunately there is no shift operation for 8-bit packing, or else we
- // could return everything with 8-bit packing.
- const __m128i mask_val = LoadLo8(mask);
- return mask_val;
-}
-
inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
const int16_t* LIBGAV1_RESTRICT const pred_1,
const __m128i pred_mask_0,
@@ -149,15 +163,14 @@ inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
}
template <int subsampling_x, int subsampling_y>
-inline void MaskBlending4x4_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
- const int16_t* LIBGAV1_RESTRICT pred_1,
- const uint8_t* LIBGAV1_RESTRICT mask,
- const ptrdiff_t mask_stride,
- uint8_t* LIBGAV1_RESTRICT dst,
- const ptrdiff_t dst_stride) {
+inline void MaskBlending4x4_SSE4_1(const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1,
+ const uint8_t* LIBGAV1_RESTRICT mask,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
const __m128i mask_inverter = _mm_set1_epi16(64);
- __m128i pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
__m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
@@ -166,30 +179,30 @@ inline void MaskBlending4x4_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
mask += mask_stride << (1 + subsampling_y);
dst += dst_stride << 1;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
}
template <int subsampling_x, int subsampling_y>
-inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
- const int16_t* LIBGAV1_RESTRICT pred_1,
- const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
- const ptrdiff_t mask_stride, const int height,
- uint8_t* LIBGAV1_RESTRICT dst,
- const ptrdiff_t dst_stride) {
+inline void MaskBlending4xH_SSE4_1(
+ const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const int height,
+ uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
+ assert(subsampling_x == 1);
const uint8_t* mask = mask_ptr;
+ constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
if (height == 4) {
- MaskBlending4x4_SSE4<subsampling_x, subsampling_y>(
- pred_0, pred_1, mask, mask_stride, dst, dst_stride);
+ MaskBlending4x4_SSE4_1<subsampling_x, subsampling_y>(pred_0, pred_1, mask,
+ dst, dst_stride);
return;
}
const __m128i mask_inverter = _mm_set1_epi16(64);
int y = 0;
do {
- __m128i pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
__m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
@@ -199,7 +212,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
mask += mask_stride << (1 + subsampling_y);
dst += dst_stride << 1;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
@@ -208,7 +221,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
mask += mask_stride << (1 + subsampling_y);
dst += dst_stride << 1;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
@@ -217,7 +230,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
mask += mask_stride << (1 + subsampling_y);
dst += dst_stride << 1;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
@@ -230,21 +243,21 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
}
template <int subsampling_x, int subsampling_y>
-inline void MaskBlend_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- const ptrdiff_t /*prediction_stride_1*/,
- const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
- const ptrdiff_t mask_stride, const int width,
- const int height, void* LIBGAV1_RESTRICT dest,
- const ptrdiff_t dst_stride) {
+inline void MaskBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const ptrdiff_t /*prediction_stride_1*/,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+ const ptrdiff_t mask_stride, const int width,
+ const int height, void* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dst_stride) {
auto* dst = static_cast<uint8_t*>(dest);
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
const ptrdiff_t pred_stride_0 = width;
const ptrdiff_t pred_stride_1 = width;
if (width == 4) {
- MaskBlending4xH_SSE4<subsampling_x, subsampling_y>(
- pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride);
+ MaskBlending4xH_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, mask_ptr, height, dst, dst_stride);
return;
}
const uint8_t* mask = mask_ptr;
@@ -293,7 +306,6 @@ inline void InterIntraWriteMaskBlendLine8bpp4x2(
const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
const __m128i pred_val_0 = LoadLo8(pred_0);
- // TODO(b/150326556): One load.
__m128i pred_val_1 = Load4(pred_1);
pred_val_1 = _mm_or_si128(_mm_slli_si128(Load4(pred_1 + pred_stride_1), 4),
pred_val_1);
@@ -309,16 +321,16 @@ inline void InterIntraWriteMaskBlendLine8bpp4x2(
}
template <int subsampling_x, int subsampling_y>
-inline void InterIntraMaskBlending8bpp4x4_SSE4(
+inline void InterIntraMaskBlending8bpp4x4_SSE4_1(
const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask,
const ptrdiff_t mask_stride) {
const __m128i mask_inverter = _mm_set1_epi8(64);
const __m128i pred_mask_u16_first =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
mask += mask_stride << (1 + subsampling_y);
const __m128i pred_mask_u16_second =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
mask += mask_stride << (1 + subsampling_y);
__m128i pred_mask_1 =
_mm_packus_epi16(pred_mask_u16_first, pred_mask_u16_second);
@@ -335,26 +347,26 @@ inline void InterIntraMaskBlending8bpp4x4_SSE4(
}
template <int subsampling_x, int subsampling_y>
-inline void InterIntraMaskBlending8bpp4xH_SSE4(
+inline void InterIntraMaskBlending8bpp4xH_SSE4_1(
const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
const ptrdiff_t pred_stride_1,
const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
const int height) {
const uint8_t* mask = mask_ptr;
if (height == 4) {
- InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+ InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
pred_0, pred_1, pred_stride_1, mask, mask_stride);
return;
}
int y = 0;
do {
- InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+ InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
pred_0, pred_1, pred_stride_1, mask, mask_stride);
pred_0 += 4 << 2;
pred_1 += pred_stride_1 << 2;
mask += mask_stride << (2 + subsampling_y);
- InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+ InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
pred_0, pred_1, pred_stride_1, mask, mask_stride);
pred_0 += 4 << 2;
pred_1 += pred_stride_1 << 2;
@@ -363,14 +375,31 @@ inline void InterIntraMaskBlending8bpp4xH_SSE4(
} while (y < height);
}
+// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because,
+// when is_inter_intra is true, the prediction values are brought to 8-bit
+// packing as well.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetInterIntraMask8bpp8(const uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t stride) {
+ if (subsampling_x == 1) {
+ const __m128i ret = GetMask8<subsampling_x, subsampling_y>(mask, stride);
+ return _mm_packus_epi16(ret, ret);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ // Unfortunately there is no shift operation for 8-bit packing, or else we
+ // could return everything with 8-bit packing.
+ const __m128i mask_val = LoadLo8(mask);
+ return mask_val;
+}
+
template <int subsampling_x, int subsampling_y>
-void InterIntraMaskBlend8bpp_SSE4(
+void InterIntraMaskBlend8bpp_SSE4_1(
const uint8_t* LIBGAV1_RESTRICT prediction_0,
uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1,
const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
const int width, const int height) {
if (width == 4) {
- InterIntraMaskBlending8bpp4xH_SSE4<subsampling_x, subsampling_y>(
+ InterIntraMaskBlending8bpp4xH_SSE4_1<subsampling_x, subsampling_y>(
prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
height);
return;
@@ -382,7 +411,7 @@ void InterIntraMaskBlend8bpp_SSE4(
int x = 0;
do {
const __m128i pred_mask_1 =
- GetInterIntraMask8<subsampling_x, subsampling_y>(
+ GetInterIntraMask8bpp8<subsampling_x, subsampling_y>(
mask + (x << subsampling_x), mask_stride);
// 64 - mask
const __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
@@ -411,24 +440,24 @@ void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend444)
- dsp->mask_blend[0][0] = MaskBlend_SSE4<0, 0>;
+ dsp->mask_blend[0][0] = MaskBlend_SSE4_1<0, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend422)
- dsp->mask_blend[1][0] = MaskBlend_SSE4<1, 0>;
+ dsp->mask_blend[1][0] = MaskBlend_SSE4_1<1, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend420)
- dsp->mask_blend[2][0] = MaskBlend_SSE4<1, 1>;
+ dsp->mask_blend[2][0] = MaskBlend_SSE4_1<1, 1>;
#endif
// The is_inter_intra index of mask_blend[][] is replaced by
// inter_intra_mask_blend_8bpp[] in 8-bit.
#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp444)
- dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4<0, 0>;
+ dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4_1<0, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp422)
- dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4<1, 0>;
+ dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4_1<1, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp420)
- dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4<1, 1>;
+ dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4_1<1, 1>;
#endif
}
@@ -443,14 +472,6 @@ constexpr int kMax10bppSample = (1 << 10) - 1;
constexpr int kMaskInverse = 64;
constexpr int kRoundBitsMaskBlend = 4;
-inline __m128i RightShiftWithRoundingZero_U16(const __m128i v_val_d, int bits,
- const __m128i zero) {
- // Shift out all but the last bit.
- const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
- // Avg with zero will shift by 1 and round.
- return _mm_avg_epu16(v_tmp_d, zero);
-}
-
inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits,
const __m128i shift) {
const __m128i v_tmp_d = _mm_add_epi32(v_val_d, shift);
@@ -458,53 +479,31 @@ inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits,
}
template <int subsampling_x, int subsampling_y>
-inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride,
- const __m128i zero) {
- if (subsampling_x == 1) {
- if (subsampling_y == 0) {
- const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
- const __m128i mask_val_1 =
- _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
- __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
- return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero);
- }
- const __m128i one = _mm_set1_epi8(1);
- const __m128i mask_val_0 =
- LoadHi8(LoadLo8(mask), mask + (mask_stride << 1));
- const __m128i mask_val_1 = LoadHi8(LoadLo8(mask + mask_stride),
- mask + (mask_stride << 1) + mask_stride);
- const __m128i add = _mm_adds_epu8(mask_val_0, mask_val_1);
- const __m128i subsampled_mask = _mm_maddubs_epi16(add, one);
- return RightShiftWithRoundingZero_U16(subsampled_mask, 2, zero);
+inline __m128i GetMask4x2(const uint8_t* mask) {
+ if (subsampling_x == 1 && subsampling_y == 1) {
+ const __m128i mask_row_01 = LoadUnaligned16(mask);
+ const __m128i mask_row_23 = LoadUnaligned16(mask + 16);
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(mask_row_01);
+ const __m128i mask_val_1 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_01, 8));
+ const __m128i mask_val_2 = _mm_cvtepu8_epi16(mask_row_23);
+ const __m128i mask_val_3 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_23, 8));
+ const __m128i subsampled_mask_02 = _mm_hadd_epi16(mask_val_0, mask_val_2);
+ const __m128i subsampled_mask_13 = _mm_hadd_epi16(mask_val_1, mask_val_3);
+ const __m128i subsampled_mask =
+ _mm_add_epi16(subsampled_mask_02, subsampled_mask_13);
+ return RightShiftWithRounding_U16(subsampled_mask, 2);
}
- assert(subsampling_y == 0 && subsampling_x == 0);
- const __m128i mask_val_0 = Load4(mask);
- const __m128i mask_val_1 = Load4(mask + mask_stride);
- return _mm_cvtepu8_epi16(
- _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
-}
-
-template <int subsampling_x, int subsampling_y>
-inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride,
- const __m128i zero) {
if (subsampling_x == 1) {
- if (subsampling_y == 0) {
- const __m128i row_vals = LoadUnaligned16(mask);
- const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
- const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
- __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
- return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero);
- }
- const __m128i one = _mm_set1_epi8(1);
- const __m128i mask_val_0 = LoadUnaligned16(mask);
- const __m128i mask_val_1 = LoadUnaligned16(mask + stride);
- const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1);
- const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
- return RightShiftWithRoundingZero_U16(mask_0, 2, zero);
+ const __m128i mask_row_01 = LoadUnaligned16(mask);
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(mask_row_01);
+ const __m128i mask_val_1 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_01, 8));
+ const __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+ return RightShiftWithRounding_U16(subsampled_mask, 1);
}
- assert(subsampling_y == 0 && subsampling_x == 0);
- const __m128i mask_val = LoadLo8(mask);
- return _mm_cvtepu8_epi16(mask_val);
+ return _mm_cvtepu8_epi16(LoadLo8(mask));
}
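A scalar reference for the 4:2:0 branch of this 10bpp GetMask4x2: with subsampling, mask rows are 8 bytes wide, so output row r, column c is the rounded average of the 2x2 block at rows (2r, 2r+1) and columns (2c, 2c+1). The name is local to this sketch:

#include <cstdint>

uint16_t Mask4x2Lane(const uint8_t mask[32], int r, int c) {
  const uint8_t* row0 = mask + 2 * r * 8;  // row 2r; row 2r+1 is 8 bytes on
  const int sum = row0[2 * c] + row0[2 * c + 1] +
                  row0[8 + 2 * c] + row0[8 + 2 * c + 1];
  return static_cast<uint16_t>((sum + 2) >> 2);  // rounded >> 2
}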
inline void WriteMaskBlendLine10bpp4x2_SSE4_1(
@@ -558,12 +557,10 @@ inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0,
uint16_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t dst_stride) {
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
- const __m128i zero = _mm_setzero_si128();
const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
const __m128i offset = _mm_set1_epi32(kCompoundOffset);
const __m128i max = _mm_set1_epi16(kMax10bppSample);
- __m128i pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
__m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
pred_mask_1, offset, max, shift4, dst,
@@ -573,8 +570,7 @@ inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0,
mask += mask_stride << (1 + subsampling_y);
dst += dst_stride << 1;
- pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
pred_mask_1, offset, max, shift4, dst,
@@ -595,7 +591,6 @@ inline void MaskBlend10bpp4xH_SSE4_1(
return;
}
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
- const __m128i zero = _mm_setzero_si128();
const uint8_t pred0_stride2 = 4 << 1;
const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y);
@@ -605,8 +600,7 @@ inline void MaskBlend10bpp4xH_SSE4_1(
const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
int y = height;
do {
- __m128i pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
__m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
@@ -617,8 +611,7 @@ inline void MaskBlend10bpp4xH_SSE4_1(
mask += mask_stride2;
dst += dst_stride2;
- pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1, offset, max,
@@ -628,8 +621,7 @@ inline void MaskBlend10bpp4xH_SSE4_1(
mask += mask_stride2;
dst += dst_stride2;
- pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1, offset, max,
@@ -639,8 +631,7 @@ inline void MaskBlend10bpp4xH_SSE4_1(
mask += mask_stride2;
dst += dst_stride2;
- pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1, offset, max,
@@ -675,7 +666,6 @@ inline void MaskBlend10bpp_SSE4_1(
}
const uint8_t* mask = mask_ptr;
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
- const __m128i zero = _mm_setzero_si128();
const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
const __m128i offset = _mm_set1_epi32(kCompoundOffset);
const __m128i max = _mm_set1_epi16(kMax10bppSample);
@@ -685,7 +675,7 @@ inline void MaskBlend10bpp_SSE4_1(
int x = 0;
do {
const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
- mask + (x << subsampling_x), mask_stride, zero);
+ mask + (x << subsampling_x), mask_stride);
const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
// 64 - mask
@@ -729,7 +719,6 @@ inline void MaskBlend10bpp_SSE4_1(
mask += mask_stride_ss;
} while (--y != 0);
}
-
inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(
const uint16_t* LIBGAV1_RESTRICT prediction_0,
const uint16_t* LIBGAV1_RESTRICT prediction_1,
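Each of the 10bpp compound paths above feeds WriteMaskBlendLine10bpp4x2_SSE4_1, whose rounding constants (offset, max, shift4) are set up at the call sites. A hedged scalar model of one output pixel, consistent with those constants but not the library's literal helper:

#include <algorithm>
#include <cstdint>

// mask_value is in [0, 64] (kMaskInverse == 64). compound_offset,
// round_bits and max_sample stand in for kCompoundOffset,
// kRoundBitsMaskBlend and kMax10bppSample.
inline uint16_t MaskBlendPixel10bpp(int pred_0, int pred_1, int mask_value,
                                    int compound_offset, int round_bits,
                                    int max_sample) {
  // 64-point interpolation between the two compound predictions, then
  // removal of the compound offset baked into both inputs.
  const int res =
      ((mask_value * pred_0 + (64 - mask_value) * pred_1) >> 6) -
      compound_offset;
  // shift4 above is the rounding term (1 << round_bits) >> 1.
  const int rounded = (res + ((1 << round_bits) >> 1)) >> round_bits;
  return static_cast<uint16_t>(std::clamp(rounded, 0, max_sample));
}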
@@ -764,9 +753,8 @@ inline void InterIntraMaskBlend10bpp4x4_SSE4_1(
uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
- const __m128i zero = _mm_setzero_si128();
__m128i pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
__m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1, shift6,
@@ -777,7 +765,7 @@ inline void InterIntraMaskBlend10bpp4x4_SSE4_1(
dst += dst_stride << 1;
pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1, shift6,
@@ -798,7 +786,6 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
return;
}
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
- const __m128i zero = _mm_setzero_si128();
const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
const uint8_t pred0_stride2 = 4 << 1;
const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
@@ -807,7 +794,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
int y = height;
do {
__m128i pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
__m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1,
@@ -818,7 +805,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
dst += dst_stride2;
pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1,
@@ -829,7 +816,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
dst += dst_stride2;
pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1,
@@ -840,7 +827,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
dst += dst_stride2;
pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1,
@@ -876,14 +863,13 @@ inline void InterIntraMaskBlend10bpp_SSE4_1(
const uint8_t* mask = mask_ptr;
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
- const __m128i zero = _mm_setzero_si128();
const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
int y = height;
do {
int x = 0;
do {
const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
- mask + (x << subsampling_x), mask_stride, zero);
+ mask + (x << subsampling_x), mask_stride);
const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
// 64 - mask
diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc
index 8ce23b4..f068ff3 100644
--- a/src/dsp/x86/obmc_sse4.cc
+++ b/src/dsp/x86/obmc_sse4.cc
@@ -39,8 +39,8 @@ namespace {
inline void OverlapBlendFromLeft2xH_SSE4_1(
uint8_t* LIBGAV1_RESTRICT const prediction,
const ptrdiff_t prediction_stride, const int height,
- const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 2;
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
@@ -51,8 +51,7 @@ inline void OverlapBlendFromLeft2xH_SSE4_1(
int y = height;
do {
const __m128i pred_val = Load2x2(pred, pred + prediction_stride);
- const __m128i obmc_pred_val =
- Load2x2(obmc_pred, obmc_pred + obmc_prediction_stride);
+ const __m128i obmc_pred_val = Load4(obmc_pred);
const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
const __m128i result =
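The dropped obmc_prediction_stride parameter here (and in the wider variants below) encodes a buffer-layout fact: the OBMC prediction is written tightly packed, so its stride equals the block width. For the 2-wide case, two consecutive rows are therefore four contiguous bytes, and one Load4 replaces the old two-row Load2x2 gather; the 4- and 8-wide paths apply the same idea with LoadLo8 and LoadUnaligned16. A tiny sketch of the invariant (names are illustrative):

#include <cstdint>
#include <cstring>

// With stride == width == 2, rows y and y+1 of the OBMC buffer form one
// contiguous 4-byte run, so a single load fetches both.
inline uint32_t LoadTwoObmcRows2w(const uint8_t* obmc_pred) {
  uint32_t two_rows;  // row y in the low 16 bits, row y+1 in the high bits.
  std::memcpy(&two_rows, obmc_pred, sizeof(two_rows));
  return two_rows;
}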
@@ -71,8 +70,8 @@ inline void OverlapBlendFromLeft2xH_SSE4_1(
inline void OverlapBlendFromLeft4xH_SSE4_1(
uint8_t* LIBGAV1_RESTRICT const prediction,
const ptrdiff_t prediction_stride, const int height,
- const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 4;
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
@@ -85,15 +84,12 @@ inline void OverlapBlendFromLeft4xH_SSE4_1(
int y = height;
do {
const __m128i pred_val0 = Load4(pred);
- const __m128i obmc_pred_val0 = Load4(obmc_pred);
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
// Place the second row of each source in the second four bytes.
const __m128i pred_val =
_mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
- const __m128i obmc_pred_val = _mm_alignr_epi8(
- Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
+ const __m128i obmc_pred_val = LoadLo8(obmc_pred);
const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
const __m128i result =
RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
@@ -102,7 +98,7 @@ inline void OverlapBlendFromLeft4xH_SSE4_1(
const int second_row_result = _mm_extract_epi32(packed_result, 1);
memcpy(pred, &second_row_result, sizeof(second_row_result));
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
y -= 2;
} while (y != 0);
}
@@ -110,8 +106,8 @@ inline void OverlapBlendFromLeft4xH_SSE4_1(
inline void OverlapBlendFromLeft8xH_SSE4_1(
uint8_t* LIBGAV1_RESTRICT const prediction,
const ptrdiff_t prediction_stride, const int height,
- const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 8;
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
const __m128i mask_inverter = _mm_set1_epi8(64);
@@ -121,16 +117,25 @@ inline void OverlapBlendFromLeft8xH_SSE4_1(
const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
int y = height;
do {
- const __m128i pred_val = LoadLo8(pred);
- const __m128i obmc_pred_val = LoadLo8(obmc_pred);
- const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
- const __m128i result =
- RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
-    StoreLo8(pred, _mm_packus_epi16(result, result));
+    const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride);
+    const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+
+    const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+    const __m128i result_lo =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6);
+
+    const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+    const __m128i result_hi =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6);
+ const __m128i result = _mm_packus_epi16(result_lo, result_hi);
+ StoreLo8(pred, result);
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
- } while (--y != 0);
+ StoreHi8(pred, result);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
+ y -= 2;
+ } while (y != 0);
}
void OverlapBlendFromLeft_SSE4_1(
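All the from-left widths compute the same per-pixel blend: _mm_maddubs_epi16 on the interleaved (pred, obmc) bytes and (mask, 64 - mask) weights produces the weighted sum, and RightShiftWithRounding_U16(..., 6) normalizes it. A scalar equivalent (a hedged sketch, not the reference implementation):

#include <cstdint>

// One blended pixel: matches maddubs(terms, masks) followed by
// RightShiftWithRounding_U16(sum, 6) == (sum + 32) >> 6.
inline uint8_t ObmcBlendPixel(uint8_t pred, uint8_t obmc_pred, int mask) {
  return static_cast<uint8_t>(
      (mask * pred + (64 - mask) * obmc_pred + 32) >> 6);
}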
@@ -144,18 +149,15 @@ void OverlapBlendFromLeft_SSE4_1(
assert(height >= 4);
if (width == 2) {
- OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
return;
}
if (width == 4) {
- OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
return;
}
if (width == 8) {
- OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
return;
}
const __m128i mask_inverter = _mm_set1_epi8(64);
@@ -192,8 +194,8 @@ void OverlapBlendFromLeft_SSE4_1(
inline void OverlapBlendFromTop4xH_SSE4_1(
uint8_t* LIBGAV1_RESTRICT const prediction,
const ptrdiff_t prediction_stride, const int height,
- const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 4;
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
const __m128i mask_inverter = _mm_set1_epi16(64);
@@ -212,13 +214,10 @@ inline void OverlapBlendFromTop4xH_SSE4_1(
_mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
const __m128i pred_val0 = Load4(pred);
- const __m128i obmc_pred_val0 = Load4(obmc_pred);
+ const __m128i obmc_pred_val = LoadLo8(obmc_pred);
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
const __m128i pred_val =
_mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
- const __m128i obmc_pred_val = _mm_alignr_epi8(
- Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val);
const __m128i result =
RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
@@ -227,7 +226,7 @@ inline void OverlapBlendFromTop4xH_SSE4_1(
Store4(pred - prediction_stride, packed_result);
Store4(pred, _mm_srli_si128(packed_result, 4));
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
y += 2;
} while (y < compute_height);
}
@@ -235,8 +234,8 @@ inline void OverlapBlendFromTop4xH_SSE4_1(
inline void OverlapBlendFromTop8xH_SSE4_1(
uint8_t* LIBGAV1_RESTRICT const prediction,
const ptrdiff_t prediction_stride, const int height,
- const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 8;
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
const uint8_t* mask = kObmcMask + height - 2;
@@ -244,20 +243,35 @@ inline void OverlapBlendFromTop8xH_SSE4_1(
const int compute_height = height - (height >> 2);
int y = compute_height;
do {
- const __m128i mask_val = _mm_set1_epi8(mask[compute_height - y]);
+ const __m128i mask_val0 = _mm_set1_epi8(mask[compute_height - y]);
// 64 - mask
- const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
- const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
- const __m128i pred_val = LoadLo8(pred);
- const __m128i obmc_pred_val = LoadLo8(obmc_pred);
- const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
- const __m128i result =
- RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
-    StoreLo8(pred, _mm_packus_epi16(result, result));
+    const __m128i obmc_mask_val0 = _mm_sub_epi8(mask_inverter, mask_val0);
+    const __m128i masks0 = _mm_unpacklo_epi8(mask_val0, obmc_mask_val0);
+ const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+
+ const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result_lo =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks0), 6);
+
+ --y;
+ const __m128i mask_val1 = _mm_set1_epi8(mask[compute_height - y]);
+ // 64 - mask
+ const __m128i obmc_mask_val1 = _mm_sub_epi8(mask_inverter, mask_val1);
+ const __m128i masks1 = _mm_unpacklo_epi8(mask_val1, obmc_mask_val1);
+
+ const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+ const __m128i result_hi =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks1), 6);
+
+ const __m128i result = _mm_packus_epi16(result_lo, result_hi);
+ StoreLo8(pred, result);
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
- } while (--y != 0);
+ StoreHi8(pred, result);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
+ } while (--y > 0);
}
void OverlapBlendFromTop_SSE4_1(
@@ -271,13 +285,11 @@ void OverlapBlendFromTop_SSE4_1(
assert(height >= 2);
if (width == 4) {
- OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
return;
}
if (width == 8) {
- OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
return;
}
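The from-top kernels blend only the rows nearest the boundary: compute_height = height - (height >> 2), i.e. the top three quarters of the block, with one kObmcMask weight per row (mask = kObmcMask + height - 2 above). A hedged scalar outline of that loop structure, reusing the per-pixel blend sketched earlier:

#include <cstddef>
#include <cstdint>

// Assumes ObmcBlendPixel from the sketch above; mask points into kObmcMask
// as in the vector code.
inline void ObmcBlendFromTop(uint8_t* pred, ptrdiff_t pred_stride, int width,
                             int height, const uint8_t* obmc_pred,
                             ptrdiff_t obmc_stride, const uint8_t* mask) {
  const int compute_height = height - (height >> 2);  // top 3/4 of the rows
  for (int y = 0; y < compute_height; ++y) {
    const int m = mask[y];  // one weight per row when blending from the top
    for (int x = 0; x < width; ++x) {
      pred[x] = ObmcBlendPixel(pred[x], obmc_pred[x], m);
    }
    pred += pred_stride;
    obmc_pred += obmc_stride;
  }
}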
@@ -333,8 +345,8 @@ constexpr int kRoundBitsObmcBlend = 6;
inline void OverlapBlendFromLeft2xH_SSE4_1(
uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
- const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_pred_stride) {
+ const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_pred_stride = 2;
uint16_t* pred = prediction;
const uint16_t* obmc_pred = obmc_prediction;
const ptrdiff_t pred_stride2 = pred_stride << 1;
@@ -348,8 +360,7 @@ inline void OverlapBlendFromLeft2xH_SSE4_1(
int y = height;
do {
const __m128i pred_val = Load4x2(pred, pred + pred_stride);
- const __m128i obmc_pred_val =
- Load4x2(obmc_pred, obmc_pred + obmc_pred_stride);
+ const __m128i obmc_pred_val = LoadLo8(obmc_pred);
const __m128i terms = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
const __m128i result = RightShiftWithRounding_U32(
_mm_madd_epi16(terms, masks), kRoundBitsObmcBlend);
@@ -364,8 +375,8 @@ inline void OverlapBlendFromLeft2xH_SSE4_1(
inline void OverlapBlendFromLeft4xH_SSE4_1(
uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
- const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_pred_stride) {
+ const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_pred_stride = 4;
uint16_t* pred = prediction;
const uint16_t* obmc_pred = obmc_prediction;
const ptrdiff_t pred_stride2 = pred_stride << 1;
@@ -379,8 +390,7 @@ inline void OverlapBlendFromLeft4xH_SSE4_1(
int y = height;
do {
const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
- const __m128i obmc_pred_val =
- LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
const __m128i result_lo = RightShiftWithRounding_U32(
@@ -410,13 +420,11 @@ void OverlapBlendFromLeft10bpp_SSE4_1(
assert(height >= 4);
if (width == 2) {
- OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred,
- obmc_pred_stride);
+ OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred);
return;
}
if (width == 4) {
- OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred,
- obmc_pred_stride);
+ OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred);
return;
}
const __m128i mask_inverter = _mm_set1_epi8(64);
@@ -452,8 +460,8 @@ void OverlapBlendFromLeft10bpp_SSE4_1(
inline void OverlapBlendFromTop4xH_SSE4_1(
uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
- const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_pred_stride) {
+ const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_pred_stride = 4;
uint16_t* pred = prediction;
const uint16_t* obmc_pred = obmc_prediction;
const __m128i mask_inverter = _mm_set1_epi16(64);
@@ -473,8 +481,7 @@ inline void OverlapBlendFromTop4xH_SSE4_1(
const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
- const __m128i obmc_pred_val =
- LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val);
const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val);
const __m128i result_lo = RightShiftWithRounding_U32(
@@ -505,8 +512,7 @@ void OverlapBlendFromTop10bpp_SSE4_1(
assert(height >= 2);
if (width == 4) {
- OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred,
- obmc_pred_stride);
+ OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred);
return;
}
diff --git a/src/dsp/x86/warp_sse4.cc b/src/dsp/x86/warp_sse4.cc
index 5830894..5498052 100644
--- a/src/dsp/x86/warp_sse4.cc
+++ b/src/dsp/x86/warp_sse4.cc
@@ -167,7 +167,7 @@ inline void WriteVerticalFilter(const __m128i filter[8],
}
template <bool is_compound, typename DestType>
-inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma,
+inline void VerticalFilter(const int16_t source[15][8], int64_t y4, int gamma,
int delta, DestType* LIBGAV1_RESTRICT dest_row,
ptrdiff_t dest_stride) {
int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
@@ -188,8 +188,8 @@ inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma,
}
template <bool is_compound, typename DestType>
-inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols, int y4,
- int gamma, int delta,
+inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols,
+ int64_t y4, int gamma, int delta,
DestType* LIBGAV1_RESTRICT dest_row,
ptrdiff_t dest_stride) {
int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
@@ -249,7 +249,7 @@ inline void WarpRegion1(const uint8_t* LIBGAV1_RESTRICT src,
template <bool is_compound, typename DestType>
inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src,
- ptrdiff_t source_stride, int source_width, int y4,
+ ptrdiff_t source_stride, int source_width, int64_t y4,
int ix4, int iy4, int gamma, int delta,
int16_t intermediate_result_column[15],
DestType* LIBGAV1_RESTRICT dst_row,
@@ -291,7 +291,7 @@ inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src,
template <bool is_compound, typename DestType>
inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src,
ptrdiff_t source_stride, int source_height, int alpha,
- int beta, int x4, int ix4, int iy4,
+ int beta, int64_t x4, int ix4, int iy4,
int16_t intermediate_result[15][8]) {
// Region 3
// At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
@@ -323,8 +323,9 @@ inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src,
template <bool is_compound, typename DestType>
inline void WarpRegion4(const uint8_t* LIBGAV1_RESTRICT src,
- ptrdiff_t source_stride, int alpha, int beta, int x4,
- int ix4, int iy4, int16_t intermediate_result[15][8]) {
+ ptrdiff_t source_stride, int alpha, int beta,
+ int64_t x4, int ix4, int iy4,
+ int16_t intermediate_result[15][8]) {
// Region 4.
// At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
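HandleWarpBlock below dispatches on where the block's 15x15 source support lands relative to the frame, which is what the four WarpRegion kernels correspond to. A compact sketch of that classification (the enum and helper names are invented for illustration; the comparisons mirror the branches below):

enum class WarpRegionKind {
  kOutsideBoth,        // Region 1: one repeated source value.
  kOutsideHorizontal,  // Region 2: rows repeated.
  kOutsideVertical,    // Region 3: columns repeated.
  kInside              // Region 4: full filtering.
};

inline WarpRegionKind ClassifyWarpRegion(int ix4, int iy4, int source_width,
                                         int source_height) {
  const bool outside_x = ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0;
  const bool outside_y = iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0;
  if (outside_x) {
    return outside_y ? WarpRegionKind::kOutsideBoth
                     : WarpRegionKind::kOutsideHorizontal;
  }
  return outside_y ? WarpRegionKind::kOutsideVertical
                   : WarpRegionKind::kInside;
}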
@@ -379,14 +380,8 @@ inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src,
int16_t intermediate_result_column[15];
};
- const int dst_x =
- src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
- const int dst_y =
- src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
- const int x4 = dst_x >> subsampling_x;
- const int y4 = dst_y >> subsampling_y;
- const int ix4 = x4 >> kWarpedModelPrecisionBits;
- const int iy4 = y4 >> kWarpedModelPrecisionBits;
+ const WarpFilterParams filter_params = GetWarpFilterParams(
+ src_x, src_y, subsampling_x, subsampling_y, warp_params);
// A prediction block may fall outside the frame's boundaries. If a
// prediction block is calculated using only samples outside the frame's
// boundary, the filtering can be simplified. We can divide the plane
@@ -439,33 +434,38 @@ inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src,
// border index (source_width - 1 or 0, respectively). Then for each x,
// the inner for loop of the horizontal filter is reduced to multiplying
// the border pixel by the sum of the filter coefficients.
- if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
- if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) {
+ if (filter_params.ix4 - 7 >= source_width - 1 || filter_params.ix4 + 7 <= 0) {
+ if ((filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0)) {
// Outside the frame in both directions. One repeated value.
- WarpRegion1<is_compound, DestType>(src, source_stride, source_width,
- source_height, ix4, iy4, dst_row,
- dest_stride);
+ WarpRegion1<is_compound, DestType>(
+ src, source_stride, source_width, source_height, filter_params.ix4,
+ filter_params.iy4, dst_row, dest_stride);
return;
}
// Outside the frame horizontally. Rows repeated.
WarpRegion2<is_compound, DestType>(
- src, source_stride, source_width, y4, ix4, iy4, gamma, delta,
- intermediate_result_column, dst_row, dest_stride);
+ src, source_stride, source_width, filter_params.y4, filter_params.ix4,
+ filter_params.iy4, gamma, delta, intermediate_result_column, dst_row,
+ dest_stride);
return;
}
- if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) {
+ if ((filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0)) {
// Outside the frame vertically.
- WarpRegion3<is_compound, DestType>(src, source_stride, source_height, alpha,
- beta, x4, ix4, iy4, intermediate_result);
+ WarpRegion3<is_compound, DestType>(
+ src, source_stride, source_height, alpha, beta, filter_params.x4,
+ filter_params.ix4, filter_params.iy4, intermediate_result);
} else {
// Inside the frame.
- WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta, x4, ix4,
- iy4, intermediate_result);
+ WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta,
+ filter_params.x4, filter_params.ix4,
+ filter_params.iy4, intermediate_result);
}
// Region 3 and 4 vertical filter.
- VerticalFilter<is_compound, DestType>(intermediate_result, y4, gamma, delta,
- dst_row, dest_stride);
+ VerticalFilter<is_compound, DestType>(intermediate_result, filter_params.y4,
+ gamma, delta, dst_row, dest_stride);
}
template <bool is_compound>
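The int to int64_t churn in this file has a single cause: dst_x and dst_y are sums of warp_params products with block coordinates and can overflow 32 bits before the kWarpedModelPrecisionBits shift, so x4/y4 are carried as int64_t. A sketch of the shared helper now called from HandleWarpBlock, reconstructed from the deleted lines (the struct and field names follow the call sites; treat the exact definition as an assumption):

#include <cstdint>

constexpr int kWarpedModelPrecisionBits = 16;  // AV1 warped-model precision.

struct WarpFilterParams {
  int64_t x4;
  int64_t y4;
  int ix4;
  int iy4;
};

inline WarpFilterParams GetWarpFilterParams(int src_x, int src_y,
                                            int subsampling_x,
                                            int subsampling_y,
                                            const int* warp_params) {
  // Same arithmetic as the removed dst_x/dst_y block, widened to 64 bits
  // so the products cannot overflow before the precision shifts.
  const int64_t dst_x = static_cast<int64_t>(src_x) * warp_params[2] +
                        static_cast<int64_t>(src_y) * warp_params[3] +
                        warp_params[0];
  const int64_t dst_y = static_cast<int64_t>(src_x) * warp_params[4] +
                        static_cast<int64_t>(src_y) * warp_params[5] +
                        warp_params[1];
  WarpFilterParams params;
  params.x4 = dst_x >> subsampling_x;
  params.y4 = dst_y >> subsampling_y;
  params.ix4 = static_cast<int>(params.x4 >> kWarpedModelPrecisionBits);
  params.iy4 = static_cast<int>(params.y4 >> kWarpedModelPrecisionBits);
  return params;
}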
diff --git a/src/dsp/x86/weight_mask_sse4.cc b/src/dsp/x86/weight_mask_sse4.cc
index 69cb784..53a374d 100644
--- a/src/dsp/x86/weight_mask_sse4.cc
+++ b/src/dsp/x86/weight_mask_sse4.cc
@@ -37,10 +37,10 @@ namespace {
constexpr int kRoundingBits8bpp = 4;
template <bool mask_is_inverse, bool is_store_16>
-inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0,
- const int16_t* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+inline void WeightMask16_SSE4_1(const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const __m128i pred_00 = LoadAligned16(prediction_0);
const __m128i pred_10 = LoadAligned16(prediction_1);
const __m128i difference_0 = RightShiftWithRounding_U16(
@@ -78,7 +78,7 @@ inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0,
}
#define WEIGHT8_PAIR_WITHOUT_STRIDE \
- WeightMask16_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride)
+ WeightMask16_SSE4_1<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride)
#define WEIGHT8_PAIR_AND_STRIDE \
WEIGHT8_PAIR_WITHOUT_STRIDE; \
@@ -87,9 +87,10 @@ inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride << 1
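WeightMask16_SSE4_1 maps the absolute difference of the two compound predictions to a 64-weight mask. A scalar model of one 8bpp mask byte (the 38/16/64 values come from the AV1 DIFFWTD_38 mask definition, not from this file, so treat them as an assumption):

#include <algorithm>
#include <cstdint>
#include <cstdlib>

// kRoundingBits8bpp == 4 above; RightShiftWithRounding(d, 4) == (d + 8) >> 4.
inline uint8_t WeightMaskValue8bpp(int pred_0, int pred_1,
                                   bool mask_is_inverse) {
  const int difference = (std::abs(pred_0 - pred_1) + 8) >> 4;
  const int mask_value = std::min(38 + (difference >> 4), 64);
  return static_cast<uint8_t>(mask_is_inverse ? 64 - mask_value : mask_value);
}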
template <bool mask_is_inverse>
-void WeightMask8x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
+void WeightMask8x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
@@ -100,10 +101,10 @@ void WeightMask8x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask8x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask8x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 3;
@@ -116,10 +117,10 @@ void WeightMask8x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask8x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y5 = 5;
@@ -132,7 +133,7 @@ void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
#define WEIGHT16_WITHOUT_STRIDE \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride)
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride)
#define WEIGHT16_AND_STRIDE \
WEIGHT16_WITHOUT_STRIDE; \
@@ -141,10 +142,10 @@ void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask16x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y = 7;
@@ -155,10 +156,10 @@ void WeightMask16x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 5;
@@ -171,10 +172,10 @@ void WeightMask16x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y5 = 6;
@@ -190,10 +191,10 @@ void WeightMask16x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 21;
@@ -205,10 +206,11 @@ void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
WEIGHT16_WITHOUT_STRIDE;
}
-#define WEIGHT32_WITHOUT_STRIDE \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
- mask + 16, mask_stride)
+#define WEIGHT32_WITHOUT_STRIDE \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride)
#define WEIGHT32_AND_STRIDE \
WEIGHT32_WITHOUT_STRIDE; \
@@ -217,10 +219,10 @@ void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask32x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
WEIGHT32_AND_STRIDE;
@@ -234,10 +236,10 @@ void WeightMask32x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 5;
@@ -250,10 +252,10 @@ void WeightMask32x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y5 = 6;
@@ -269,10 +271,10 @@ void WeightMask32x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 21;
@@ -284,14 +286,15 @@ void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
WEIGHT32_WITHOUT_STRIDE;
}
-#define WEIGHT64_WITHOUT_STRIDE \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
- mask + 16, mask_stride); \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
- mask + 32, mask_stride); \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
- mask + 48, mask_stride)
+#define WEIGHT64_WITHOUT_STRIDE \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride); \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
+ mask + 32, mask_stride); \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
+ mask + 48, mask_stride)
#define WEIGHT64_AND_STRIDE \
WEIGHT64_WITHOUT_STRIDE; \
@@ -300,10 +303,10 @@ void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask64x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -316,10 +319,10 @@ void WeightMask64x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y5 = 0;
@@ -335,10 +338,10 @@ void WeightMask64x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -351,10 +354,10 @@ void WeightMask64x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x128_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -368,10 +371,10 @@ void WeightMask64x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask128x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask128x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -412,10 +415,10 @@ void WeightMask128x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask128x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask128x128_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -466,8 +469,9 @@ void WeightMask128x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
#define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \
dsp->weight_mask[w_index][h_index][0] = \
- WeightMask##width##x##height##_SSE4<0>; \
- dsp->weight_mask[w_index][h_index][1] = WeightMask##width##x##height##_SSE4<1>
+ WeightMask##width##x##height##_SSE4_1<0>; \
+ dsp->weight_mask[w_index][h_index][1] = \
+ WeightMask##width##x##height##_SSE4_1<1>
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
@@ -501,7 +505,7 @@ constexpr int kRoundingBits10bpp = 6;
constexpr int kScaledDiffShift = 4;
template <bool mask_is_inverse, bool is_store_16>
-inline void WeightMask16_10bpp_SSE4(
+inline void WeightMask16_10bpp_SSE4_1(
const uint16_t* LIBGAV1_RESTRICT prediction_0,
const uint16_t* LIBGAV1_RESTRICT prediction_1,
uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
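The 10bpp variant follows the same mapping with wider intermediates: the prediction difference is rounded by kRoundingBits10bpp (6) and scaled by kScaledDiffShift (4) before the bias and cap. A hedged sketch, parallel to the 8bpp model above:

#include <algorithm>
#include <cstdint>
#include <cstdlib>

inline uint8_t WeightMaskValue10bpp(int pred_0, int pred_1,
                                    bool mask_is_inverse) {
  // RightShiftWithRounding(d, kRoundingBits10bpp) == (d + 32) >> 6, then
  // scale by kScaledDiffShift before the DIFFWTD-style 38/64 mapping.
  const int difference = (std::abs(pred_0 - pred_1) + 32) >> 6;
  const int mask_value = std::min(38 + (difference >> 4), 64);
  return static_cast<uint8_t>(mask_is_inverse ? 64 - mask_value : mask_value);
}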
@@ -562,9 +566,9 @@ inline void WeightMask16_10bpp_SSE4(
}
}
-#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP \
- WeightMask16_10bpp_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, \
- mask_stride)
+#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, false>(pred_0, pred_1, mask, \
+ mask_stride)
#define WEIGHT8_PAIR_AND_STRIDE_10BPP \
WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; \
@@ -573,10 +577,10 @@ inline void WeightMask16_10bpp_SSE4(
mask += mask_stride << 1
template <bool mask_is_inverse>
-void WeightMask8x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask8x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -587,10 +591,10 @@ void WeightMask8x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask8x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask8x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 3;
@@ -603,10 +607,10 @@ void WeightMask8x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask8x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y5 = 5;
@@ -618,9 +622,9 @@ void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
}
-#define WEIGHT16_WITHOUT_STRIDE_10BPP \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \
- mask_stride)
+#define WEIGHT16_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride)
#define WEIGHT16_AND_STRIDE_10BPP \
WEIGHT16_WITHOUT_STRIDE_10BPP; \
@@ -629,10 +633,10 @@ void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask16x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y = 7;
@@ -643,10 +647,10 @@ void WeightMask16x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 5;
@@ -659,10 +663,10 @@ void WeightMask16x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y5 = 6;
@@ -678,10 +682,10 @@ void WeightMask16x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 21;
@@ -693,11 +697,11 @@ void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
WEIGHT16_WITHOUT_STRIDE_10BPP;
}
-#define WEIGHT32_WITHOUT_STRIDE_10BPP \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \
- mask_stride); \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
- mask + 16, mask_stride)
+#define WEIGHT32_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride)
#define WEIGHT32_AND_STRIDE_10BPP \
WEIGHT32_WITHOUT_STRIDE_10BPP; \
@@ -706,10 +710,10 @@ void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask32x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
WEIGHT32_AND_STRIDE_10BPP;
@@ -723,10 +727,10 @@ void WeightMask32x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 5;
@@ -739,10 +743,10 @@ void WeightMask32x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y5 = 6;
@@ -758,10 +762,10 @@ void WeightMask32x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 21;
@@ -773,15 +777,15 @@ void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
WEIGHT32_WITHOUT_STRIDE_10BPP;
}
-#define WEIGHT64_WITHOUT_STRIDE_10BPP \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \
- mask_stride); \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
- mask + 16, mask_stride); \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
- mask + 32, mask_stride); \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
- mask + 48, mask_stride)
+#define WEIGHT64_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride); \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
+ mask + 32, mask_stride); \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
+ mask + 48, mask_stride)
#define WEIGHT64_AND_STRIDE_10BPP \
WEIGHT64_WITHOUT_STRIDE_10BPP; \
@@ -790,10 +794,10 @@ void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask64x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 5;
@@ -806,10 +810,10 @@ void WeightMask64x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y5 = 6;
@@ -825,10 +829,10 @@ void WeightMask64x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 21;
@@ -841,10 +845,10 @@ void WeightMask64x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x128_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 42;
@@ -858,10 +862,10 @@ void WeightMask64x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask128x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask128x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 21;
@@ -902,10 +906,10 @@ void WeightMask128x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask128x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask128x128_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 42;
@@ -956,9 +960,9 @@ void WeightMask128x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
#define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \
dsp->weight_mask[w_index][h_index][0] = \
- WeightMask##width##x##height##_10bpp_SSE4<0>; \
+ WeightMask##width##x##height##_10bpp_SSE4_1<0>; \
dsp->weight_mask[w_index][h_index][1] = \
- WeightMask##width##x##height##_10bpp_SSE4<1>
+ WeightMask##width##x##height##_10bpp_SSE4_1<1>
void Init10bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
assert(dsp != nullptr);