diff options
author | qinxialei <xialeiqin@gmail.com> | 2021-04-22 11:20:18 +0800 |
---|---|---|
committer | qinxialei <xialeiqin@gmail.com> | 2021-04-22 11:20:18 +0800 |
commit | 81ce37eb93e8ce442ecb1855a4e7166628128ac7 (patch) | |
tree | 2af6329f74f88ce090d08c61db5fb4bed8656584 | |
parent | 4dab0c756a3cdd65b43470a4cca835422b32ca6e (diff) | |
parent | 2381d803c76105f44717d75f089ec37f51e5cfe4 (diff) | |
download | libgav1-81ce37eb93e8ce442ecb1855a4e7166628128ac7.tar.gz libgav1-81ce37eb93e8ce442ecb1855a4e7166628128ac7.tar.bz2 libgav1-81ce37eb93e8ce442ecb1855a4e7166628128ac7.zip |
Update upstream source from tag 'upstream/0.16.3'
Update to upstream version '0.16.3'
with Debian dir a433e04a69210eb8fcdd6089240e161eb33f0590
179 files changed, 36565 insertions, 7327 deletions
diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..b934084 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +* whitespace=tab-in-indent,space-before-tab,trailing-space diff --git a/CMakeLists.txt b/CMakeLists.txt index 5d00ae6..5e9e17a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,31 +36,17 @@ endif() set(libgav1_examples "${libgav1_root}/examples") set(libgav1_source "${libgav1_root}/src") -include(FindThreads) - -include("${libgav1_examples}/libgav1_examples.cmake") -include("${libgav1_root}/cmake/libgav1_build_definitions.cmake") -include("${libgav1_root}/cmake/libgav1_cpu_detection.cmake") -include("${libgav1_root}/cmake/libgav1_flags.cmake") -include("${libgav1_root}/cmake/libgav1_helpers.cmake") -include("${libgav1_root}/cmake/libgav1_install.cmake") -include("${libgav1_root}/cmake/libgav1_intrinsics.cmake") include("${libgav1_root}/cmake/libgav1_options.cmake") -include("${libgav1_root}/cmake/libgav1_sanitizer.cmake") -include("${libgav1_root}/cmake/libgav1_targets.cmake") -include("${libgav1_root}/cmake/libgav1_variables.cmake") -include("${libgav1_source}/dsp/libgav1_dsp.cmake") -include("${libgav1_source}/libgav1_decoder.cmake") -include("${libgav1_source}/utils/libgav1_utils.cmake") libgav1_option(NAME LIBGAV1_ENABLE_OPTIMIZATIONS HELPSTRING "Enables optimized code." VALUE ON) -libgav1_option(NAME LIBGAV1_ENABLE_AVX2 HELPSTRING - "Enables avx2 optimizations." VALUE ON) +libgav1_option(NAME LIBGAV1_ENABLE_AVX2 HELPSTRING "Enables avx2 optimizations." + VALUE ON) libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations." VALUE ON) libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING "Enables sse4.1 optimizations." VALUE ON) +libgav1_option(NAME LIBGAV1_ENABLE_TESTS HELPSTRING "Enables tests." VALUE ON) libgav1_option( NAME LIBGAV1_VERBOSE HELPSTRING "Enables verbose build system output. Higher numbers are more verbose." 
VALUE @@ -70,6 +56,23 @@ if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() +include(FindThreads) + +include("${libgav1_examples}/libgav1_examples.cmake") +include("${libgav1_root}/cmake/libgav1_build_definitions.cmake") +include("${libgav1_root}/cmake/libgav1_cpu_detection.cmake") +include("${libgav1_root}/cmake/libgav1_flags.cmake") +include("${libgav1_root}/cmake/libgav1_helpers.cmake") +include("${libgav1_root}/cmake/libgav1_install.cmake") +include("${libgav1_root}/cmake/libgav1_intrinsics.cmake") +include("${libgav1_root}/cmake/libgav1_sanitizer.cmake") +include("${libgav1_root}/cmake/libgav1_targets.cmake") +include("${libgav1_root}/cmake/libgav1_variables.cmake") +include("${libgav1_root}/tests/libgav1_tests.cmake") +include("${libgav1_source}/dsp/libgav1_dsp.cmake") +include("${libgav1_source}/libgav1_decoder.cmake") +include("${libgav1_source}/utils/libgav1_utils.cmake") + libgav1_optimization_detect() libgav1_set_build_definitions() libgav1_set_cxx_flags() @@ -109,13 +112,27 @@ if(NOT "${LIBGAV1_EXE_LINKER_FLAGS}" STREQUAL "") separate_arguments(LIBGAV1_EXE_LINKER_FLAGS) endif() -add_subdirectory("${libgav1_root}/third_party/abseil-cpp" - "${libgav1_abseil_build}" EXCLUDE_FROM_ALL) +# Set test-only flags based on LIBGAV1_CXX_FLAGS. +libgav1_set_test_flags() + +set(libgav1_abseil "${libgav1_root}/third_party/abseil-cpp") +if(NOT EXISTS "${libgav1_abseil}") + message( + FATAL_ERROR + "Abseil not found. This dependency is required by the" + " examples & tests and libgav1 when LIBGAV1_THREADPOOL_USE_STD_MUTEX is" + " not defined. 
To continue, download the Abseil repository to" + " third_party/abseil-cpp:\n git \\\n -C ${libgav1_root} \\\n" + " clone \\\n" + " https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp") +endif() +add_subdirectory("${libgav1_abseil}" "${libgav1_abseil_build}" EXCLUDE_FROM_ALL) libgav1_reset_target_lists() libgav1_add_dsp_targets() libgav1_add_decoder_targets() libgav1_add_examples_targets() +libgav1_add_tests_targets() libgav1_add_utils_targets() libgav1_setup_install_target() @@ -20,7 +20,18 @@ information on the AV1 video format can be found at From within the libgav1 directory: ```shell - $ git clone https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp + $ git clone https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp + ``` + + Note: Abseil is required by the examples and tests. libgav1 will depend on + it if `LIBGAV1_THREADPOOL_USE_STD_MUTEX` is set to `0` (see below). + +4. (Optional) [GoogleTest](https://github.com/google/googletest) + + From within the libgav1 directory: + + ```shell + $ git clone https://github.com/google/googletest.git third_party/googletest ``` ### Compile @@ -58,10 +69,11 @@ Configuration options: * `LIBGAV1_THREADPOOL_USE_STD_MUTEX`: controls use of std::mutex and absl::Mutex in ThreadPool. Defining this to 1 will remove any Abseil dependency from the core library. Automatically defined in - `src/utils/threadpool.h` if unset. + `src/utils/threadpool.h` if unset. Defaults to 1 on Android & iOS, 0 + otherwise. * `LIBGAV1_MAX_THREADS`: sets the number of threads that the library is - allowed to create. Has to be an integer > 0. Otherwise this is ignored. - The default value is 128. + allowed to create. Has to be an integer > 0. Otherwise this is ignored. The + default value is 128. * `LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER`: the threshold multiplier that is used to determine when to use frame parallel decoding. Frame parallel decoding will be used if |threads| > |tile_count| * this multiplier. 
Has to diff --git a/cmake/libgav1_build_definitions.cmake b/cmake/libgav1_build_definitions.cmake index b170e7e..fc83490 100644 --- a/cmake/libgav1_build_definitions.cmake +++ b/cmake/libgav1_build_definitions.cmake @@ -21,7 +21,24 @@ macro(libgav1_set_build_definitions) string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type_lowercase) libgav1_load_version_info() - set(LIBGAV1_SOVERSION 0) + + # Library version info. See the libtool docs for updating the values: + # https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info + # + # c=<current>, r=<revision>, a=<age> + # + # libtool generates a .so file as .so.[c-a].a.r, while -version-info c:r:a is + # passed to libtool. + # + # We set LIBGAV1_SOVERSION = [c-a].a.r + set(LT_CURRENT 0) + set(LT_REVISION 0) + set(LT_AGE 0) + math(EXPR LIBGAV1_SOVERSION_MAJOR "${LT_CURRENT} - ${LT_AGE}") + set(LIBGAV1_SOVERSION "${LIBGAV1_SOVERSION_MAJOR}.${LT_AGE}.${LT_REVISION}") + unset(LT_CURRENT) + unset(LT_REVISION) + unset(LT_AGE) list(APPEND libgav1_include_paths "${libgav1_root}" "${libgav1_root}/src" "${libgav1_build}" "${libgav1_root}/third_party/abseil-cpp") @@ -89,9 +106,7 @@ macro(libgav1_set_build_definitions) endif() if(build_type_lowercase MATCHES "rel") - # TODO(tomfinegan): this value is only a concern for the core library and - # can be made smaller if the test targets are avoided. - list(APPEND libgav1_base_cxx_flags "-Wstack-usage=196608") + list(APPEND libgav1_base_cxx_flags "-Wframe-larger-than=196608") endif() list(APPEND libgav1_msvc_cxx_flags diff --git a/cmake/libgav1_flags.cmake b/cmake/libgav1_flags.cmake index 2d8d9a6..a5408e2 100644 --- a/cmake/libgav1_flags.cmake +++ b/cmake/libgav1_flags.cmake @@ -205,7 +205,7 @@ macro(libgav1_test_exe_linker_flag) # Restore cached global exe linker flags. 
if(cached_CMAKE_EXE_LINKER_FLAGS) - set(CMAKE_EXE_LINKER_FLAGS cached_CMAKE_EXE_LINKER_FLAGS) + set(CMAKE_EXE_LINKER_FLAGS ${cached_CMAKE_EXE_LINKER_FLAGS}) else() unset(CMAKE_EXE_LINKER_FLAGS) endif() @@ -249,3 +249,15 @@ macro(libgav1_set_cxx_flags) libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES ${cxx_flag_lists}) endmacro() + +# Sets LIBGAV1_TEST_C_FLAGS and LIBGAV1_TEST_CXX_FLAGS. +# +# Note: libgav1_set_cxx_flags() must be called before this macro. Furthermore, +# the call to this macro should be made after all additions to LIBGAV1_CXX_FLAGS +# are complete. +macro(libgav1_set_test_flags) + if(LIBGAV1_ENABLE_TESTS) + set(LIBGAV1_TEST_CXX_FLAGS ${LIBGAV1_CXX_FLAGS}) + list(FILTER LIBGAV1_TEST_CXX_FLAGS EXCLUDE REGEX "-Wframe-larger-than") + endif() +endmacro() diff --git a/cmake/libgav1_helpers.cmake b/cmake/libgav1_helpers.cmake index 76d8d67..ac16257 100644 --- a/cmake/libgav1_helpers.cmake +++ b/cmake/libgav1_helpers.cmake @@ -20,7 +20,13 @@ set(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_ 1) # Kills build generation using message(FATAL_ERROR) and outputs all data passed # to the console via use of $ARGN. macro(libgav1_die) - message(FATAL_ERROR ${ARGN}) + # macro parameters are not variables so a temporary is needed to work with + # list(). + set(msg ${ARGN}) + # message(${ARGN}) will merge all list elements with no separator while + # "${ARGN}" will output the list as a ';' delimited string. + list(JOIN msg " " msg) + message(FATAL_ERROR "${msg}") endmacro() # Converts semi-colon delimited list variable(s) to string. Output is written to @@ -94,10 +100,10 @@ macro(libgav1_create_dummy_source_file) "${dummy_source_dir}/libgav1_${cdsf_TARGET}_${cdsf_BASENAME}.cc") set(dummy_source_code "// Generated file. DO NOT EDIT!\n" - "// C++ source file created for target ${cdsf_TARGET}. 
\n" - "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void);\n" + "// C++ source file created for target ${cdsf_TARGET}.\n" + "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void)\;\n" "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void) {}\n") - file(WRITE "${dummy_source_file}" "${dummy_source_code}") + file(WRITE "${dummy_source_file}" ${dummy_source_code}) target_sources(${cdsf_TARGET} PRIVATE ${dummy_source_file}) diff --git a/cmake/libgav1_sanitizer.cmake b/cmake/libgav1_sanitizer.cmake index 4bb2263..2f9ee07 100644 --- a/cmake/libgav1_sanitizer.cmake +++ b/cmake/libgav1_sanitizer.cmake @@ -39,7 +39,9 @@ macro(libgav1_configure_sanitizer) list(APPEND LIBGAV1_CXX_FLAGS "-fno-omit-frame-pointer" "-fno-optimize-sibling-calls") - libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES LIBGAV1_CXX_FLAGS FLAG_REQUIRED) + # Check the linker flags first as they may be required in the compile check + # to avoid undefined symbols related to the sanitizer. libgav1_test_exe_linker_flag(FLAG_LIST_VAR_NAME LIBGAV1_EXE_LINKER_FLAGS) + libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES LIBGAV1_CXX_FLAGS FLAG_REQUIRED) endif() endmacro() diff --git a/cmake/libgav1_targets.cmake b/cmake/libgav1_targets.cmake index 78b4865..997f8bd 100644 --- a/cmake/libgav1_targets.cmake +++ b/cmake/libgav1_targets.cmake @@ -29,7 +29,7 @@ endmacro() # Creates an executable target. 
The target name is passed as a parameter to the # NAME argument, and the sources passed as a parameter to the SOURCES argument: -# libgav1_add_test(NAME <name> SOURCES <sources> [optional args]) +# libgav1_add_executable(NAME <name> SOURCES <sources> [optional args]) # # Optional args: # cmake-format: off @@ -115,15 +115,35 @@ macro(libgav1_add_executable) target_include_directories(${exe_NAME} PRIVATE ${exe_INCLUDES}) endif() - if(exe_COMPILE_FLAGS OR LIBGAV1_CXX_FLAGS) + unset(exe_LIBGAV1_COMPILE_FLAGS) + if(exe_TEST) + list(FILTER exe_SOURCES INCLUDE REGEX "\\.c$") + list(LENGTH exe_SOURCES exe_SOURCES_length) + if(exe_SOURCES_length EQUAL 0) + set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_TEST_CXX_FLAGS}) + else() + set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_TEST_C_FLAGS}) + endif() + else() + set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_CXX_FLAGS}) + endif() + + if(exe_COMPILE_FLAGS OR exe_LIBGAV1_COMPILE_FLAGS) target_compile_options(${exe_NAME} - PRIVATE ${exe_COMPILE_FLAGS} ${LIBGAV1_CXX_FLAGS}) + PRIVATE ${exe_COMPILE_FLAGS} + ${exe_LIBGAV1_COMPILE_FLAGS}) endif() if(exe_LINK_FLAGS OR LIBGAV1_EXE_LINKER_FLAGS) - set_target_properties(${exe_NAME} - PROPERTIES LINK_FLAGS ${exe_LINK_FLAGS} - ${LIBGAV1_EXE_LINKER_FLAGS}) + list(APPEND exe_LINK_FLAGS "${LIBGAV1_EXE_LINKER_FLAGS}") + if(${CMAKE_VERSION} VERSION_LESS "3.13") + # LINK_FLAGS is managed as a string. 
+ libgav1_set_and_stringify(SOURCE "${exe_LINK_FLAGS}" DEST exe_LINK_FLAGS) + set_target_properties(${exe_NAME} + PROPERTIES LINK_FLAGS "${exe_LINK_FLAGS}") + else() + target_link_options(${exe_NAME} PRIVATE ${exe_LINK_FLAGS}) + endif() endif() if(exe_OBJLIB_DEPS) @@ -137,7 +157,7 @@ macro(libgav1_add_executable) endif() if(BUILD_SHARED_LIBS AND (MSVC OR WIN32)) - target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0") + target_compile_definitions(${exe_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0") endif() if(exe_LIB_DEPS) @@ -321,7 +341,9 @@ macro(libgav1_add_library) endif() if(lib_TYPE STREQUAL SHARED AND NOT MSVC) - set_target_properties(${lib_NAME} PROPERTIES SOVERSION ${LIBGAV1_SOVERSION}) + set_target_properties(${lib_NAME} + PROPERTIES VERSION ${LIBGAV1_SOVERSION} SOVERSION + ${LIBGAV1_SOVERSION_MAJOR}) endif() if(BUILD_SHARED_LIBS AND (MSVC OR WIN32)) diff --git a/examples/gav1_decode.cc b/examples/gav1_decode.cc index 4de0ba2..1408e8c 100644 --- a/examples/gav1_decode.cc +++ b/examples/gav1_decode.cc @@ -419,6 +419,9 @@ int main(int argc, char* argv[]) { input_buffers.ReleaseInputBuffer(input_buffer); } input_buffer = nullptr; + // Clear any in progress frames to ensure the output frame limit is + // respected. 
+ decoder.SignalEOS(); } } while (input_buffer != nullptr || (!file_reader->IsEndOfFile() && !limit_reached) || diff --git a/examples/logging.h b/examples/logging.h index c0bcad7..cf5a09f 100644 --- a/examples/logging.h +++ b/examples/logging.h @@ -46,7 +46,7 @@ constexpr const char* Basename(const char* file_name, size_t offset) { #define LIBGAV1_EXAMPLES_LOG_ERROR(error_string) \ do { \ constexpr const char* libgav1_examples_basename = \ - ::libgav1::examples::Basename(__FILE__, sizeof(__FILE__) - 1); \ + libgav1::examples::Basename(__FILE__, sizeof(__FILE__) - 1); \ fprintf(stderr, "%s:%d (%s): %s.\n", libgav1_examples_basename, __LINE__, \ __func__, error_string); \ } while (false) diff --git a/src/decoder_impl.cc b/src/decoder_impl.cc index 751671d..e23903c 100644 --- a/src/decoder_impl.cc +++ b/src/decoder_impl.cc @@ -36,7 +36,6 @@ #include "src/utils/common.h" #include "src/utils/constants.h" #include "src/utils/logging.h" -#include "src/utils/parameter_tree.h" #include "src/utils/raw_bit_reader.h" #include "src/utils/segmentation.h" #include "src/utils/threadpool.h" @@ -631,10 +630,6 @@ DecoderImpl::~DecoderImpl() { } StatusCode DecoderImpl::Init() { - if (!GenerateWedgeMask(&wedge_masks_)) { - LIBGAV1_DLOG(ERROR, "GenerateWedgeMask() failed."); - return kStatusOutOfMemory; - } if (!output_frame_queue_.Init(kMaxLayers)) { LIBGAV1_DLOG(ERROR, "output_frame_queue_.Init() failed."); return kStatusOutOfMemory; @@ -857,6 +852,10 @@ StatusCode DecoderImpl::ParseAndSchedule(const uint8_t* data, size_t size, LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed."); return kStatusOutOfMemory; } + if (!MaybeInitializeWedgeMasks(obu->frame_header().frame_type)) { + LIBGAV1_DLOG(ERROR, "InitializeWedgeMasks() failed."); + return kStatusOutOfMemory; + } if (IsNewSequenceHeader(*obu)) { const ObuSequenceHeader& sequence_header = obu->sequence_header(); const Libgav1ImageFormat image_format = @@ -1050,6 +1049,10 @@ StatusCode DecoderImpl::DecodeTemporalUnit(const 
TemporalUnit& temporal_unit, LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed."); return kStatusOutOfMemory; } + if (!MaybeInitializeWedgeMasks(obu->frame_header().frame_type)) { + LIBGAV1_DLOG(ERROR, "InitializeWedgeMasks() failed."); + return kStatusOutOfMemory; + } if (IsNewSequenceHeader(*obu)) { const ObuSequenceHeader& sequence_header = obu->sequence_header(); const Libgav1ImageFormat image_format = @@ -1278,8 +1281,7 @@ StatusCode DecoderImpl::DecodeTiles( // without having to check for boundary conditions. if (!frame_scratch_buffer->block_parameters_holder.Reset( frame_header.rows4x4 + kMaxBlockHeight4x4, - frame_header.columns4x4 + kMaxBlockWidth4x4, - sequence_header.use_128x128_superblock)) { + frame_header.columns4x4 + kMaxBlockWidth4x4)) { return kStatusOutOfMemory; } const dsp::Dsp* const dsp = @@ -1646,6 +1648,17 @@ bool DecoderImpl::IsNewSequenceHeader(const ObuParser& obu) { return sequence_header_changed; } +bool DecoderImpl::MaybeInitializeWedgeMasks(FrameType frame_type) { + if (IsIntraFrame(frame_type) || wedge_masks_initialized_) { + return true; + } + if (!GenerateWedgeMask(&wedge_masks_)) { + return false; + } + wedge_masks_initialized_ = true; + return true; +} + bool DecoderImpl::MaybeInitializeQuantizerMatrix( const ObuFrameHeader& frame_header) { if (quantizer_matrix_initialized_ || !frame_header.quantizer.use_matrix) { diff --git a/src/decoder_impl.h b/src/decoder_impl.h index 721b666..b52ecdf 100644 --- a/src/decoder_impl.h +++ b/src/decoder_impl.h @@ -215,6 +215,10 @@ class DecoderImpl : public Allocable { // |quantizer_matrix_initialized_| to true. bool MaybeInitializeQuantizerMatrix(const ObuFrameHeader& frame_header); + // Allocates and generates the |wedge_masks_| if necessary and sets + // |wedge_masks_initialized_| to true. + bool MaybeInitializeWedgeMasks(FrameType frame_type); + // Elements in this queue cannot be moved with std::move since the // |EncodedFrame.temporal_unit| stores a pointer to elements in this queue. 
Queue<TemporalUnit> temporal_units_; @@ -233,6 +237,7 @@ class DecoderImpl : public Allocable { BufferPool buffer_pool_; WedgeMaskArray wedge_masks_; + bool wedge_masks_initialized_ = false; QuantizerMatrix quantizer_matrix_; bool quantizer_matrix_initialized_ = false; FrameScratchBufferPool frame_scratch_buffer_pool_; diff --git a/src/decoder_state.h b/src/decoder_state.h index 897c99f..ea5c792 100644 --- a/src/decoder_state.h +++ b/src/decoder_state.h @@ -33,7 +33,6 @@ struct DecoderState { for (int ref_index = 0, mask = refresh_frame_flags; mask != 0; ++ref_index, mask >>= 1) { if ((mask & 1) != 0) { - reference_valid[ref_index] = true; reference_frame_id[ref_index] = current_frame_id; reference_frame[ref_index] = current_frame; reference_order_hint[ref_index] = order_hint; @@ -43,7 +42,6 @@ struct DecoderState { // Clears all the reference frames. void ClearReferenceFrames() { - reference_valid = {}; reference_frame_id = {}; reference_order_hint = {}; for (int ref_index = 0; ref_index < kNumReferenceFrameTypes; ++ref_index) { @@ -51,12 +49,11 @@ struct DecoderState { } } - // reference_valid and reference_frame_id are used only if - // sequence_header_.frame_id_numbers_present is true. - // The reference_valid array is indexed by a reference picture slot number. - // A value (boolean) in the array signifies whether the corresponding - // reference picture slot is valid for use as a reference picture. - std::array<bool, kNumReferenceFrameTypes> reference_valid = {}; + // reference_frame_id and current_frame_id have meaningful values and are used + // in checks only if sequence_header_.frame_id_numbers_present is true. If + // sequence_header_.frame_id_numbers_present is false, reference_frame_id and + // current_frame_id are assigned the default value 0 and are not used in + // checks. std::array<uint16_t, kNumReferenceFrameTypes> reference_frame_id = {}; // A valid value of current_frame_id is an unsigned integer of at most 16 // bits. 
-1 indicates current_frame_id is not initialized. @@ -81,6 +78,11 @@ struct DecoderState { // * |true| indicates that the reference frame is a backwards reference. // Note: reference_frame_sign_bias[0] (for kReferenceFrameIntra) is not used. std::array<bool, kNumReferenceFrameTypes> reference_frame_sign_bias = {}; + // The RefValid[i] variable in the spec does not need to be stored explicitly. + // If the RefValid[i] variable in the spec is 0, then reference_frame[i] is a + // null pointer. (Whenever the spec sets the RefValid[i] variable to 0, we set + // reference_frame[i] to a null pointer.) If the RefValid[i] variable in the + // spec is 1, then reference_frame[i] contains a frame buffer pointer. std::array<RefCountedBufferPtr, kNumReferenceFrameTypes> reference_frame; }; diff --git a/src/dsp/arm/average_blend_neon.cc b/src/dsp/arm/average_blend_neon.cc index 834e8b4..5b4c094 100644 --- a/src/dsp/arm/average_blend_neon.cc +++ b/src/dsp/arm/average_blend_neon.cc @@ -35,6 +35,11 @@ namespace { constexpr int kInterPostRoundBit = kInterRoundBitsVertical - kInterRoundBitsCompoundVertical; +} // namespace + +namespace low_bitdepth { +namespace { + inline uint8x8_t AverageBlend8Row(const int16_t* prediction_0, const int16_t* prediction_1) { const int16x8_t pred0 = vld1q_s16(prediction_0); @@ -128,13 +133,139 @@ void Init8bpp() { } } // namespace +} // namespace low_bitdepth + +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +inline uint16x8_t AverageBlend8Row(const uint16_t* prediction_0, + const uint16_t* prediction_1, + const int32x4_t compound_offset, + const uint16x8_t v_bitdepth) { + const uint16x8_t pred0 = vld1q_u16(prediction_0); + const uint16x8_t pred1 = vld1q_u16(prediction_1); + const uint32x4_t pred_lo = + vaddl_u16(vget_low_u16(pred0), vget_low_u16(pred1)); + const uint32x4_t pred_hi = + vaddl_u16(vget_high_u16(pred0), vget_high_u16(pred1)); + const int32x4_t offset_lo = + vsubq_s32(vreinterpretq_s32_u32(pred_lo), 
compound_offset); + const int32x4_t offset_hi = + vsubq_s32(vreinterpretq_s32_u32(pred_hi), compound_offset); + const uint16x4_t res_lo = vqrshrun_n_s32(offset_lo, kInterPostRoundBit + 1); + const uint16x4_t res_hi = vqrshrun_n_s32(offset_hi, kInterPostRoundBit + 1); + return vminq_u16(vcombine_u16(res_lo, res_hi), v_bitdepth); +} + +inline void AverageBlendLargeRow(const uint16_t* prediction_0, + const uint16_t* prediction_1, const int width, + uint16_t* dest, + const int32x4_t compound_offset, + const uint16x8_t v_bitdepth) { + int x = width; + do { + vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1, + compound_offset, v_bitdepth)); + prediction_0 += 8; + prediction_1 += 8; + dest += 8; + + vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1, + compound_offset, v_bitdepth)); + prediction_0 += 8; + prediction_1 += 8; + dest += 8; + + x -= 16; + } while (x != 0); +} + +void AverageBlend_NEON(const void* prediction_0, const void* prediction_1, + const int width, const int height, void* const dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast<uint16_t*>(dest); + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y = height; + + const ptrdiff_t dst_stride = dest_stride >> 1; + const int32x4_t compound_offset = + vdupq_n_s32(static_cast<int32_t>(kCompoundOffset + kCompoundOffset)); + const uint16x8_t v_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1); + if (width == 4) { + do { + const uint16x8_t result = + AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth); + pred_0 += 8; + pred_1 += 8; + + vst1_u16(dst, vget_low_u16(result)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(result)); + dst += dst_stride; + y -= 2; + } while (y != 0); + return; + } + + if (width == 8) { + do { + vst1q_u16(dst, + AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth)); + dst += dst_stride; + pred_0 += 8; + pred_1 += 8; + + vst1q_u16(dst, + 
AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth)); + dst += dst_stride; + pred_0 += 8; + pred_1 += 8; + + y -= 2; + } while (y != 0); + return; + } + + do { + AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset, + v_bitdepth); + dst += dst_stride; + pred_0 += width; + pred_1 += width; + + AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset, + v_bitdepth); + dst += dst_stride; + pred_0 += width; + pred_1 += width; + + y -= 2; + } while (y != 0); +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->average_blend = AverageBlend_NEON; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 -void AverageBlendInit_NEON() { Init8bpp(); } +void AverageBlendInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/cdef_neon.cc b/src/dsp/arm/cdef_neon.cc index 4d0e76f..60c72d6 100644 --- a/src/dsp/arm/cdef_neon.cc +++ b/src/dsp/arm/cdef_neon.cc @@ -265,7 +265,7 @@ LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source, // 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00 // 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00 // 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00 - partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), partial_lo[2], 0); + partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), vdupq_n_u16(0), 0); partial_lo[2] = vsetq_lane_u16(SumVector(v_src[1]), partial_lo[2], 1); partial_lo[2] = vsetq_lane_u16(SumVector(v_src[2]), partial_lo[2], 2); partial_lo[2] = vsetq_lane_u16(SumVector(v_src[3]), partial_lo[2], 3); @@ -285,9 +285,8 @@ LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source, // 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00 // 60 61 62 63 64 65 66 
67 00 00 00 00 00 00 00 00 // 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00 - const uint8x8_t v_zero = vdup_n_u8(0); - partial_lo[6] = vaddl_u8(v_zero, v_src[0]); - for (int i = 1; i < 8; ++i) { + partial_lo[6] = vaddl_u8(v_src[0], v_src[1]); + for (int i = 2; i < 8; ++i) { partial_lo[6] = vaddw_u8(partial_lo[6], v_src[i]); } @@ -451,7 +450,7 @@ void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride, int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference, const uint16x8_t threshold, const int16x8_t damping) { - // If reference > pixel, the difference will be negative, so covert to 0 or + // If reference > pixel, the difference will be negative, so convert to 0 or // -1. const uint16x8_t sign = vcgtq_u16(reference, pixel); const uint16x8_t abs_diff = vabdq_u16(pixel, reference); @@ -686,7 +685,7 @@ void CdefInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/common_neon.h b/src/dsp/arm/common_neon.h index dcb7567..05e0d05 100644 --- a/src/dsp/arm/common_neon.h +++ b/src/dsp/arm/common_neon.h @@ -28,8 +28,7 @@ #if 0 #include <cstdio> - -#include "absl/strings/str_cat.h" +#include <string> constexpr bool kEnablePrintRegs = true; @@ -86,11 +85,11 @@ inline void PrintVectQ(const DebugRegisterQ r, const char* const name, inline void PrintReg(const int32x4x2_t val, const std::string& name) { DebugRegisterQ r; - vst1q_u32(r.u32, val.val[0]); - const std::string name0 = absl::StrCat(name, ".val[0]").c_str(); + vst1q_s32(r.i32, val.val[0]); + const std::string name0 = name + std::string(".val[0]"); PrintVectQ(r, name0.c_str(), 32); - vst1q_u32(r.u32, val.val[1]); - const std::string name1 = absl::StrCat(name, ".val[1]").c_str(); + vst1q_s32(r.i32, val.val[1]); + const std::string name1 = name + std::string(".val[1]"); PrintVectQ(r, name1.c_str(), 32); } @@ -169,14 +168,14 @@ inline void 
PrintReg(const int8x8_t val, const char* name) { // Print an individual (non-vector) value in decimal format. inline void PrintReg(const int x, const char* name) { if (kEnablePrintRegs) { - printf("%s: %d\n", name, x); + fprintf(stderr, "%s: %d\n", name, x); } } // Print an individual (non-vector) value in hexadecimal format. inline void PrintHex(const int x, const char* name) { if (kEnablePrintRegs) { - printf("%s: %x\n", name, x); + fprintf(stderr, "%s: %x\n", name, x); } } @@ -277,22 +276,32 @@ inline void Store2(uint16_t* const buf, const uint16x4_t val) { ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane)); } +// Simplify code when caller has |buf| cast as uint8_t*. +inline void Store4(void* const buf, const uint16x4_t val) { + vst1_u16(static_cast<uint16_t*>(buf), val); +} + +// Simplify code when caller has |buf| cast as uint8_t*. +inline void Store8(void* const buf, const uint16x8_t val) { + vst1q_u16(static_cast<uint16_t*>(buf), val); +} + //------------------------------------------------------------------------------ // Bit manipulation. // vshXX_n_XX() requires an immediate. 
template <int shift> -inline uint8x8_t LeftShift(const uint8x8_t vector) { +inline uint8x8_t LeftShiftVector(const uint8x8_t vector) { return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vector), shift)); } template <int shift> -inline uint8x8_t RightShift(const uint8x8_t vector) { +inline uint8x8_t RightShiftVector(const uint8x8_t vector) { return vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(vector), shift)); } template <int shift> -inline int8x8_t RightShift(const int8x8_t vector) { +inline int8x8_t RightShiftVector(const int8x8_t vector) { return vreinterpret_s8_u64(vshr_n_u64(vreinterpret_u64_s8(vector), shift)); } @@ -387,6 +396,15 @@ inline uint16_t SumVector(const uint8x8_t a) { #endif // defined(__aarch64__) } +inline uint32_t SumVector(const uint32x2_t a) { +#if defined(__aarch64__) + return vaddv_u32(a); +#else + const uint64x1_t b = vpaddl_u32(a); + return vget_lane_u32(vreinterpret_u32_u64(b), 0); +#endif // defined(__aarch64__) +} + inline uint32_t SumVector(const uint32x4_t a) { #if defined(__aarch64__) return vaddvq_u32(a); @@ -447,6 +465,36 @@ inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) { } // Input: +// 00 01 02 03 +// 10 11 12 13 +// 20 21 22 23 +// 30 31 32 33 +inline void Transpose4x4(uint16x4_t a[4]) { + // b: + // 00 10 02 12 + // 01 11 03 13 + const uint16x4x2_t b = vtrn_u16(a[0], a[1]); + // c: + // 20 30 22 32 + // 21 31 23 33 + const uint16x4x2_t c = vtrn_u16(a[2], a[3]); + // d: + // 00 10 20 30 + // 02 12 22 32 + const uint32x2x2_t d = + vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0])); + // e: + // 01 11 21 31 + // 03 13 23 33 + const uint32x2x2_t e = + vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1])); + a[0] = vreinterpret_u16_u32(d.val[0]); + a[1] = vreinterpret_u16_u32(e.val[0]); + a[2] = vreinterpret_u16_u32(d.val[1]); + a[3] = vreinterpret_u16_u32(e.val[1]); +} + +// Input: // a: 00 01 02 03 10 11 12 13 // b: 20 21 22 23 30 31 32 33 // Output: diff --git 
a/src/dsp/arm/convolve_neon.cc b/src/dsp/arm/convolve_neon.cc index fd9b912..331bfe2 100644 --- a/src/dsp/arm/convolve_neon.cc +++ b/src/dsp/arm/convolve_neon.cc @@ -101,245 +101,278 @@ int16x8_t SumOnePassTaps(const uint8x8_t* const src, return vreinterpretq_s16_u16(sum); } -template <int filter_index, bool negative_outside_taps> -int16x8_t SumHorizontalTaps(const uint8_t* const src, - const uint8x8_t* const v_tap) { - uint8x8_t v_src[8]; - const uint8x16_t src_long = vld1q_u8(src); - int16x8_t sum; - - if (filter_index < 2) { - v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 1)); - v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 2)); - v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 3)); - v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 4)); - v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 5)); - v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 6)); - sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 1); - } else if (filter_index == 2) { - v_src[0] = vget_low_u8(src_long); - v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); - v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); - v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); - v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); - v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); - v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6)); - v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7)); - sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap); - } else if (filter_index == 3) { - v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 3)); - v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 4)); - sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 3); - } else if (filter_index > 3) { - v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 2)); - v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 3)); - v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 4)); - v_src[3] = 
vget_low_u8(vextq_u8(src_long, src_long, 5)); - sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 2); - } - return sum; -} - -template <int filter_index, bool negative_outside_taps> -uint8x8_t SimpleHorizontalTaps(const uint8_t* const src, - const uint8x8_t* const v_tap) { - int16x8_t sum = - SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap); - - // Normally the Horizontal pass does the downshift in two passes: - // kInterRoundBitsHorizontal - 1 and then (kFilterBits - - // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them - // requires adding the rounding offset from the skipped shift. - constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); - - sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit)); - return vqrshrun_n_s16(sum, kFilterBits - 1); -} - -template <int filter_index, bool negative_outside_taps> -uint16x8_t HorizontalTaps8To16(const uint8_t* const src, - const uint8x8_t* const v_tap) { - const int16x8_t sum = - SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap); - - return vreinterpretq_u16_s16( - vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1)); -} - -template <int filter_index> -int16x8_t SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, - const uint8x8_t* const v_tap) { - uint16x8_t sum; - const uint8x8_t input0 = vld1_u8(src); - src += src_stride; - const uint8x8_t input1 = vld1_u8(src); - uint8x8x2_t input = vzip_u8(input0, input1); - - if (filter_index == 3) { - // tap signs : + + - sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]); - sum = vmlal_u8(sum, input.val[1], v_tap[4]); - } else if (filter_index == 4) { - // tap signs : - + + - - sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]); - sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]); - sum = vmlal_u8(sum, input.val[1], v_tap[4]); - sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]); - } else { - // tap signs 
: + + + + - sum = vmull_u8(RightShift<4 * 8>(input.val[0]), v_tap[2]); - sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]); - sum = vmlal_u8(sum, input.val[1], v_tap[4]); - sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]); - } - - return vreinterpretq_s16_u16(sum); -} - -template <int filter_index> -uint8x8_t SimpleHorizontalTaps2x2(const uint8_t* src, - const ptrdiff_t src_stride, - const uint8x8_t* const v_tap) { - int16x8_t sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); - - // Normally the Horizontal pass does the downshift in two passes: - // kInterRoundBitsHorizontal - 1 and then (kFilterBits - - // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them - // requires adding the rounding offset from the skipped shift. - constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); - - sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit)); - return vqrshrun_n_s16(sum, kFilterBits - 1); -} - -template <int filter_index> -uint16x8_t HorizontalTaps8To16_2x2(const uint8_t* src, - const ptrdiff_t src_stride, - const uint8x8_t* const v_tap) { - const int16x8_t sum = - SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); - - return vreinterpretq_u16_s16( - vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1)); -} - -template <int num_taps, int step, int filter_index, - bool negative_outside_taps = true, bool is_2d = false, - bool is_compound = false> -void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, - void* const dest, const ptrdiff_t pred_stride, - const int width, const int height, - const uint8x8_t* const v_tap) { +template <int filter_index, bool negative_outside_taps, bool is_2d, + bool is_compound> +void FilterHorizontalWidth8AndUp(const uint8_t* src, const ptrdiff_t src_stride, + void* const dest, const ptrdiff_t pred_stride, + const int width, const int height, + const uint8x8_t* const v_tap) { auto* dest8 = static_cast<uint8_t*>(dest); auto* 
dest16 = static_cast<uint16_t*>(dest); - - // 4 tap filters are never used when width > 4. - if (num_taps != 4 && width > 4) { - int y = 0; + if (!is_2d) { + int y = height; do { int x = 0; - do { - if (is_2d || is_compound) { - const uint16x8_t v_sum = - HorizontalTaps8To16<filter_index, negative_outside_taps>(&src[x], - v_tap); + do { // Increasing loop counter x is better. + const uint8x16_t src_long = vld1q_u8(src + x); + uint8x8_t v_src[8]; + int16x8_t sum; + if (filter_index < 2) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); + v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); + v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); + v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); + sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, + v_tap + 1); + } else if (filter_index == 2) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); + v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); + v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); + v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); + v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6)); + v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7)); + sum = SumOnePassTaps<filter_index, false>(v_src, v_tap); + } else if (filter_index == 3) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3); + } else if (filter_index > 3) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); + v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); + sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2); + } + if (is_compound) { + const uint16x8_t v_sum = 
vreinterpretq_u16_s16( + vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1)); vst1q_u16(&dest16[x], v_sum); } else { - const uint8x8_t result = - SimpleHorizontalTaps<filter_index, negative_outside_taps>(&src[x], - v_tap); + // Normally the Horizontal pass does the downshift in two passes: + // kInterRoundBitsHorizontal - 1 and then (kFilterBits - + // kInterRoundBitsHorizontal). Each one uses a rounding shift. + // Combining them requires adding the rounding offset from the skipped + // shift. + constexpr int first_shift_rounding_bit = + 1 << (kInterRoundBitsHorizontal - 2); + sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit)); + const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1); vst1_u8(&dest8[x], result); } - x += step; + x += 8; } while (x < width); src += src_stride; dest8 += pred_stride; dest16 += pred_stride; - } while (++y < height); + } while (--y != 0); + } else { + int x = 0; + do { + const uint8_t* s = src + x; + int y = height; + do { // Increasing loop counter x is better. 
+ const uint8x16_t src_long = vld1q_u8(s); + uint8x8_t v_src[8]; + int16x8_t sum; + if (filter_index < 2) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); + v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); + v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); + v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); + sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, + v_tap + 1); + } else if (filter_index == 2) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); + v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); + v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); + v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); + v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6)); + v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7)); + sum = SumOnePassTaps<filter_index, false>(v_src, v_tap); + } else if (filter_index == 3) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3); + } else if (filter_index > 3) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); + v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); + sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2); + } + const uint16x8_t v_sum = vreinterpretq_u16_s16( + vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1)); + vst1q_u16(dest16, v_sum); + s += src_stride; + dest16 += 8; + } while (--y != 0); + x += 8; + } while (x < width); + } +} + +template <int filter_index, bool is_2d, bool is_compound> +void FilterHorizontalWidth4(const uint8_t* src, const ptrdiff_t src_stride, + void* const dest, const ptrdiff_t pred_stride, + const int 
height, const uint8x8_t* const v_tap) { + auto* dest8 = static_cast<uint8_t*>(dest); + auto* dest16 = static_cast<uint16_t*>(dest); + int y = height; + do { + uint8x8_t v_src[4]; + int16x8_t sum; + v_src[0] = vld1_u8(src); + if (filter_index == 3) { + v_src[1] = RightShiftVector<1 * 8>(v_src[0]); + sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3); + } else { + v_src[1] = RightShiftVector<1 * 8>(v_src[0]); + v_src[2] = RightShiftVector<2 * 8>(v_src[0]); + v_src[3] = RightShiftVector<3 * 8>(v_src[0]); + sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2); + } + if (is_2d || is_compound) { + const uint16x4_t v_sum = vreinterpret_u16_s16( + vrshr_n_s16(vget_low_s16(sum), kInterRoundBitsHorizontal - 1)); + vst1_u16(dest16, v_sum); + } else { + constexpr int first_shift_rounding_bit = + 1 << (kInterRoundBitsHorizontal - 2); + sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit)); + const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1); + StoreLo4(&dest8[0], result); + } + src += src_stride; + dest8 += pred_stride; + dest16 += pred_stride; + } while (--y != 0); +} + +template <int filter_index, bool is_2d> +void FilterHorizontalWidth2(const uint8_t* src, const ptrdiff_t src_stride, + void* const dest, const ptrdiff_t pred_stride, + const int height, const uint8x8_t* const v_tap) { + auto* dest8 = static_cast<uint8_t*>(dest); + auto* dest16 = static_cast<uint16_t*>(dest); + int y = height >> 1; + do { + const uint8x8_t input0 = vld1_u8(src); + const uint8x8_t input1 = vld1_u8(src + src_stride); + const uint8x8x2_t input = vzip_u8(input0, input1); + uint16x8_t sum; + if (filter_index == 3) { + // tap signs : + + + sum = vmull_u8(input.val[0], v_tap[3]); + sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 2), v_tap[4]); + } else if (filter_index == 4) { + // tap signs : - + + - + sum = vmull_u8(RightShiftVector<2 * 8>(input.val[0]), v_tap[3]); + sum = vmlsl_u8(sum, input.val[0], v_tap[2]); + sum = vmlal_u8(sum, RightShiftVector<4 * 
8>(input.val[0]), v_tap[4]); + sum = vmlsl_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]); + } else { + // tap signs : + + + + + sum = vmull_u8(input.val[0], v_tap[2]); + sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input.val[0]), v_tap[3]); + sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]); + sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]); + } + int16x8_t s = vreinterpretq_s16_u16(sum); + if (is_2d) { + const uint16x8_t v_sum = + vreinterpretq_u16_s16(vrshrq_n_s16(s, kInterRoundBitsHorizontal - 1)); + dest16[0] = vgetq_lane_u16(v_sum, 0); + dest16[1] = vgetq_lane_u16(v_sum, 2); + dest16 += pred_stride; + dest16[0] = vgetq_lane_u16(v_sum, 1); + dest16[1] = vgetq_lane_u16(v_sum, 3); + dest16 += pred_stride; + } else { + // Normally the Horizontal pass does the downshift in two passes: + // kInterRoundBitsHorizontal - 1 and then (kFilterBits - + // kInterRoundBitsHorizontal). Each one uses a rounding shift. + // Combining them requires adding the rounding offset from the skipped + // shift. + constexpr int first_shift_rounding_bit = + 1 << (kInterRoundBitsHorizontal - 2); + s = vaddq_s16(s, vdupq_n_s16(first_shift_rounding_bit)); + const uint8x8_t result = vqrshrun_n_s16(s, kFilterBits - 1); + dest8[0] = vget_lane_u8(result, 0); + dest8[1] = vget_lane_u8(result, 2); + dest8 += pred_stride; + dest8[0] = vget_lane_u8(result, 1); + dest8[1] = vget_lane_u8(result, 3); + dest8 += pred_stride; + } + src += src_stride << 1; + } while (--y != 0); + + // The 2d filters have an odd |height| because the horizontal pass + // generates context for the vertical pass. 
+ if (is_2d) { + assert(height % 2 == 1); + const uint8x8_t input = vld1_u8(src); + uint16x8_t sum; + if (filter_index == 3) { + sum = vmull_u8(input, v_tap[3]); + sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[4]); + } else if (filter_index == 4) { + sum = vmull_u8(RightShiftVector<1 * 8>(input), v_tap[3]); + sum = vmlsl_u8(sum, input, v_tap[2]); + sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]); + sum = vmlsl_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]); + } else { + assert(filter_index == 5); + sum = vmull_u8(input, v_tap[2]); + sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[3]); + sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]); + sum = vmlal_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]); + } + // |sum| contains an int16_t value. + sum = vreinterpretq_u16_s16(vrshrq_n_s16(vreinterpretq_s16_u16(sum), + kInterRoundBitsHorizontal - 1)); + Store2<0>(dest16, sum); + } +} + +template <int filter_index, bool negative_outside_taps, bool is_2d, + bool is_compound> +void FilterHorizontal(const uint8_t* const src, const ptrdiff_t src_stride, + void* const dest, const ptrdiff_t pred_stride, + const int width, const int height, + const uint8x8_t* const v_tap) { + assert(width < 8 || filter_index <= 3); + // Don't simplify the redundant if conditions with the template parameters, + // which helps the compiler generate compact code. + if (width >= 8 && filter_index <= 3) { + FilterHorizontalWidth8AndUp<filter_index, negative_outside_taps, is_2d, + is_compound>(src, src_stride, dest, pred_stride, + width, height, v_tap); return; } - // Horizontal passes only needs to account for |num_taps| 2 and 4 when + // Horizontal passes only needs to account for number of taps 2 and 4 when // |width| <= 4. 
assert(width <= 4); - assert(num_taps <= 4); - if (num_taps <= 4) { + assert(filter_index >= 3 && filter_index <= 5); + if (filter_index >= 3 && filter_index <= 5) { if (width == 4) { - int y = 0; - do { - if (is_2d || is_compound) { - const uint16x8_t v_sum = - HorizontalTaps8To16<filter_index, negative_outside_taps>(src, - v_tap); - vst1_u16(dest16, vget_low_u16(v_sum)); - } else { - const uint8x8_t result = - SimpleHorizontalTaps<filter_index, negative_outside_taps>(src, - v_tap); - StoreLo4(&dest8[0], result); - } - src += src_stride; - dest8 += pred_stride; - dest16 += pred_stride; - } while (++y < height); + FilterHorizontalWidth4<filter_index, is_2d, is_compound>( + src, src_stride, dest, pred_stride, height, v_tap); return; } - + assert(width == 2); if (!is_compound) { - int y = 0; - do { - if (is_2d) { - const uint16x8_t sum = - HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap); - dest16[0] = vgetq_lane_u16(sum, 0); - dest16[1] = vgetq_lane_u16(sum, 2); - dest16 += pred_stride; - dest16[0] = vgetq_lane_u16(sum, 1); - dest16[1] = vgetq_lane_u16(sum, 3); - dest16 += pred_stride; - } else { - const uint8x8_t sum = - SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); - - dest8[0] = vget_lane_u8(sum, 0); - dest8[1] = vget_lane_u8(sum, 2); - dest8 += pred_stride; - - dest8[0] = vget_lane_u8(sum, 1); - dest8[1] = vget_lane_u8(sum, 3); - dest8 += pred_stride; - } - - src += src_stride << 1; - y += 2; - } while (y < height - 1); - - // The 2d filters have an odd |height| because the horizontal pass - // generates context for the vertical pass. 
- if (is_2d) { - assert(height % 2 == 1); - uint16x8_t sum; - const uint8x8_t input = vld1_u8(src); - if (filter_index == 3) { // |num_taps| == 2 - sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]); - sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]); - } else if (filter_index == 4) { - sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]); - sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]); - sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]); - sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]); - } else { - assert(filter_index == 5); - sum = vmull_u8(RightShift<2 * 8>(input), v_tap[2]); - sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]); - sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]); - sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]); - } - // |sum| contains an int16_t value. - sum = vreinterpretq_u16_s16(vrshrq_n_s16( - vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1)); - Store2<0>(dest16, sum); - } + FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest, + pred_stride, height, v_tap); } } } @@ -451,78 +484,85 @@ int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src, } template <int num_taps, bool is_compound = false> -void Filter2DVertical(const uint16_t* src, void* const dst, - const ptrdiff_t dst_stride, const int width, - const int height, const int16x8_t taps) { +void Filter2DVerticalWidth8AndUp(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int width, + const int height, const int16x8_t taps) { assert(width >= 8); constexpr int next_row = num_taps - 1; - // The Horizontal pass uses |width| as |stride| for the intermediate buffer. 
- const ptrdiff_t src_stride = width; - - auto* dst8 = static_cast<uint8_t*>(dst); - auto* dst16 = static_cast<uint16_t*>(dst); + auto* const dst8 = static_cast<uint8_t*>(dst); + auto* const dst16 = static_cast<uint16_t*>(dst); int x = 0; do { - int16x8_t srcs[8]; - const uint16_t* src_x = src + x; - srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; + int16x8_t srcs[9]; + srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; if (num_taps >= 4) { - srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; - srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; + srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; + srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; if (num_taps >= 6) { - srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; - srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; + srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; + srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; if (num_taps == 8) { - srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; - srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; + srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; + srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; } } } - int y = 0; + uint8_t* d8 = dst8 + x; + uint16_t* d16 = dst16 + x; + int y = height; do { - srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; - - const int16x8_t sum = - SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); + srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; + srcs[next_row + 1] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; + const int16x8_t sum0 = + SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 0, taps); + const int16x8_t sum1 = + SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 1, taps); if (is_compound) { - vst1q_u16(dst16 + x + y * 
dst_stride, vreinterpretq_u16_s16(sum)); + vst1q_u16(d16, vreinterpretq_u16_s16(sum0)); + d16 += dst_stride; + vst1q_u16(d16, vreinterpretq_u16_s16(sum1)); + d16 += dst_stride; } else { - vst1_u8(dst8 + x + y * dst_stride, vqmovun_s16(sum)); + vst1_u8(d8, vqmovun_s16(sum0)); + d8 += dst_stride; + vst1_u8(d8, vqmovun_s16(sum1)); + d8 += dst_stride; } - - srcs[0] = srcs[1]; + srcs[0] = srcs[2]; if (num_taps >= 4) { - srcs[1] = srcs[2]; - srcs[2] = srcs[3]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; if (num_taps >= 6) { - srcs[3] = srcs[4]; - srcs[4] = srcs[5]; + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; if (num_taps == 8) { - srcs[5] = srcs[6]; - srcs[6] = srcs[7]; + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; } } } - } while (++y < height); + y -= 2; + } while (y != 0); x += 8; } while (x < width); } // Take advantage of |src_stride| == |width| to process two rows at a time. template <int num_taps, bool is_compound = false> -void Filter2DVertical4xH(const uint16_t* src, void* const dst, - const ptrdiff_t dst_stride, const int height, - const int16x8_t taps) { +void Filter2DVerticalWidth4(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int height, + const int16x8_t taps) { auto* dst8 = static_cast<uint8_t*>(dst); auto* dst16 = static_cast<uint16_t*>(dst); @@ -545,7 +585,7 @@ void Filter2DVertical4xH(const uint16_t* src, void* const dst, } } - int y = 0; + int y = height; do { srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; @@ -580,15 +620,15 @@ void Filter2DVertical4xH(const uint16_t* src, void* const dst, } } } - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } // Take advantage of |src_stride| == |width| to process four rows at a time. 
template <int num_taps> -void Filter2DVertical2xH(const uint16_t* src, void* const dst, - const ptrdiff_t dst_stride, const int height, - const int16x8_t taps) { +void Filter2DVerticalWidth2(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int height, + const int16x8_t taps) { constexpr int next_row = (num_taps < 6) ? 4 : 8; auto* dst8 = static_cast<uint8_t*>(dst); @@ -672,29 +712,47 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( } if (filter_index == 2) { // 8 tap. - FilterHorizontal<8, 8, 2, true, is_2d, is_compound>( + FilterHorizontal<2, true, is_2d, is_compound>( src, src_stride, dst, dst_stride, width, height, v_tap); } else if (filter_index == 1) { // 6 tap. // Check if outside taps are positive. if ((filter_id == 1) | (filter_id == 15)) { - FilterHorizontal<6, 8, 1, false, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<1, false, is_2d, is_compound>( + src + 1, src_stride, dst, dst_stride, width, height, v_tap); } else { - FilterHorizontal<6, 8, 1, true, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<1, true, is_2d, is_compound>( + src + 1, src_stride, dst, dst_stride, width, height, v_tap); } } else if (filter_index == 0) { // 6 tap. - FilterHorizontal<6, 8, 0, true, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<0, true, is_2d, is_compound>( + src + 1, src_stride, dst, dst_stride, width, height, v_tap); } else if (filter_index == 4) { // 4 tap. - FilterHorizontal<4, 8, 4, true, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<4, true, is_2d, is_compound>( + src + 2, src_stride, dst, dst_stride, width, height, v_tap); } else if (filter_index == 5) { // 4 tap. 
- FilterHorizontal<4, 8, 5, true, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<5, true, is_2d, is_compound>( + src + 2, src_stride, dst, dst_stride, width, height, v_tap); } else { // 2 tap. - FilterHorizontal<2, 8, 3, true, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<3, true, is_2d, is_compound>( + src + 3, src_stride, dst, dst_stride, width, height, v_tap); + } +} + +template <int vertical_taps> +void Filter2DVertical(const uint16_t* const intermediate_result, + const int width, const int height, const int16x8_t taps, + void* const prediction, const ptrdiff_t pred_stride) { + auto* const dest = static_cast<uint8_t*>(prediction); + if (width >= 8) { + Filter2DVerticalWidth8AndUp<vertical_taps>( + intermediate_result, dest, pred_stride, width, height, taps); + } else if (width == 4) { + Filter2DVerticalWidth4<vertical_taps>(intermediate_result, dest, + pred_stride, height, taps); + } else { + assert(width == 2); + Filter2DVerticalWidth2<vertical_taps>(intermediate_result, dest, + pred_stride, height, taps); } } @@ -704,7 +762,7 @@ void Convolve2D_NEON(const void* const reference, const int vertical_filter_index, const int horizontal_filter_id, const int vertical_filter_id, const int width, - const int height, void* prediction, + const int height, void* const prediction, const ptrdiff_t pred_stride) { const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); @@ -715,67 +773,31 @@ void Convolve2D_NEON(const void* const reference, intermediate_result[kMaxSuperBlockSizeInPixels * (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)]; const int intermediate_height = height + vertical_taps - 1; - const ptrdiff_t src_stride = reference_stride; - const auto* src = static_cast<const uint8_t*>(reference) - - (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset; + const 
auto* const src = static_cast<const uint8_t*>(reference) - + (vertical_taps / 2 - 1) * src_stride - + kHorizontalOffset; DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width, width, intermediate_height, horizontal_filter_id, horiz_filter_index); // Vertical filter. - auto* dest = static_cast<uint8_t*>(prediction); - const ptrdiff_t dest_stride = pred_stride; assert(vertical_filter_id != 0); - const int16x8_t taps = vmovl_s8( vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id])); - if (vertical_taps == 8) { - if (width == 2) { - Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height, - taps); - } else if (width == 4) { - Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height, - taps); - } else { - Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height, - taps); - } + Filter2DVertical<8>(intermediate_result, width, height, taps, prediction, + pred_stride); } else if (vertical_taps == 6) { - if (width == 2) { - Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height, - taps); - } else if (width == 4) { - Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height, - taps); - } else { - Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height, - taps); - } + Filter2DVertical<6>(intermediate_result, width, height, taps, prediction, + pred_stride); } else if (vertical_taps == 4) { - if (width == 2) { - Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height, - taps); - } else if (width == 4) { - Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height, - taps); - } else { - Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height, - taps); - } + Filter2DVertical<4>(intermediate_result, width, height, taps, prediction, + pred_stride); } else { // |vertical_taps| == 2 - if (width == 2) { - Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height, - taps); - } else if (width == 4) { - 
Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height, - taps); - } else { - Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height, - taps); - } + Filter2DVertical<2>(intermediate_result, width, height, taps, prediction, + pred_stride); } } @@ -788,7 +810,7 @@ void Convolve2D_NEON(const void* const reference, // increments. The first load covers the initial elements of src_x, while the // final load covers the taps. template <int grade_x> -inline uint8x8x3_t LoadSrcVals(const uint8_t* src_x) { +inline uint8x8x3_t LoadSrcVals(const uint8_t* const src_x) { uint8x8x3_t ret; const uint8x16_t src_val = vld1q_u8(src_x); ret.val[0] = vget_low_u8(src_val); @@ -811,7 +833,7 @@ inline uint8x16_t GetPositive2TapFilter(const int tap_index) { } template <int grade_x> -inline void ConvolveKernelHorizontal2Tap(const uint8_t* src, +inline void ConvolveKernelHorizontal2Tap(const uint8_t* const src, const ptrdiff_t src_stride, const int width, const int subpixel_x, const int step_x, @@ -843,7 +865,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src, // on x. const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices), VQTbl1U8(filter_taps1, filter_indices)}; - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x16_t src_vals = vld1q_u8(src_x); @@ -860,7 +882,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src, kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); return; } @@ -883,7 +905,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src, // on x. const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices), VQTbl1U8(filter_taps1, filter_indices)}; - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. 
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x); @@ -900,7 +922,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src, kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate_x += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); x += 8; p += step_x8; } while (x < width); @@ -921,7 +943,7 @@ inline uint8x16_t GetPositive4TapFilter(const int tap_index) { // This filter is only possible when width <= 4. void ConvolveKernelHorizontalPositive4Tap( - const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x, + const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x, const int step_x, const int intermediate_height, int16_t* intermediate) { const int kernel_offset = 2; const int ref_x = subpixel_x >> kScaleSubPixelBits; @@ -950,7 +972,7 @@ void ConvolveKernelHorizontalPositive4Tap( const uint8x8_t src_indices = vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)); - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped index vectors. const uint8x16_t src_vals = vld1q_u8(src_x); @@ -970,7 +992,7 @@ void ConvolveKernelHorizontalPositive4Tap( src_x += src_stride; intermediate += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); } // Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4]. @@ -988,7 +1010,7 @@ inline uint8x16_t GetSigned4TapFilter(const int tap_index) { // This filter is only possible when width <= 4. 
inline void ConvolveKernelHorizontalSigned4Tap( - const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x, + const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x, const int step_x, const int intermediate_height, int16_t* intermediate) { const int kernel_offset = 2; const int ref_x = subpixel_x >> kScaleSubPixelBits; @@ -1025,7 +1047,7 @@ inline void ConvolveKernelHorizontalSigned4Tap( vadd_u8(src_indices_base, vdup_n_u8(2)), vadd_u8(src_indices_base, vdup_n_u8(3))}; - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x16_t src_vals = vld1q_u8(src_x); @@ -1042,7 +1064,7 @@ inline void ConvolveKernelHorizontalSigned4Tap( kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); } // Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0]. @@ -1063,9 +1085,9 @@ inline uint8x16_t GetSigned6TapFilter(const int tap_index) { // This filter is only possible when width >= 8. template <int grade_x> inline void ConvolveKernelHorizontalSigned6Tap( - const uint8_t* src, const ptrdiff_t src_stride, const int width, + const uint8_t* const src, const ptrdiff_t src_stride, const int width, const int subpixel_x, const int step_x, const int intermediate_height, - int16_t* intermediate) { + int16_t* const intermediate) { const int kernel_offset = 1; const uint8x8_t one = vdup_n_u8(1); const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); @@ -1107,7 +1129,7 @@ inline void ConvolveKernelHorizontalSigned6Tap( for (int i = 0; i < 6; ++i) { taps[i] = VQTbl1U8(filter_taps[i], filter_indices); } - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. 
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x); @@ -1122,7 +1144,7 @@ inline void ConvolveKernelHorizontalSigned6Tap( kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate_x += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); x += 8; p += step_x8; } while (x < width); @@ -1156,9 +1178,9 @@ inline int8x16_t GetMixed6TapFilter(const int tap_index) { // This filter is only possible when width >= 8. template <int grade_x> inline void ConvolveKernelHorizontalMixed6Tap( - const uint8_t* src, const ptrdiff_t src_stride, const int width, + const uint8_t* const src, const ptrdiff_t src_stride, const int width, const int subpixel_x, const int step_x, const int intermediate_height, - int16_t* intermediate) { + int16_t* const intermediate) { const int kernel_offset = 1; const uint8x8_t one = vdup_n_u8(1); const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); @@ -1205,7 +1227,7 @@ inline void ConvolveKernelHorizontalMixed6Tap( mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices)); mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices)); - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x); @@ -1224,7 +1246,7 @@ inline void ConvolveKernelHorizontalMixed6Tap( kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate_x += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); x += 8; p += step_x8; } while (x < width); @@ -1250,9 +1272,9 @@ inline uint8x16_t GetSigned8TapFilter(const int tap_index) { // This filter is only possible when width >= 8. 
template <int grade_x> inline void ConvolveKernelHorizontalSigned8Tap( - const uint8_t* src, const ptrdiff_t src_stride, const int width, + const uint8_t* const src, const ptrdiff_t src_stride, const int width, const int subpixel_x, const int step_x, const int intermediate_height, - int16_t* intermediate) { + int16_t* const intermediate) { const uint8x8_t one = vdup_n_u8(1); const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); const int ref_x = subpixel_x >> kScaleSubPixelBits; @@ -1290,7 +1312,7 @@ inline void ConvolveKernelHorizontalSigned8Tap( taps[i] = VQTbl1U8(filter_taps[i], filter_indices); } - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x); @@ -1306,7 +1328,7 @@ inline void ConvolveKernelHorizontalSigned8Tap( kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate_x += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); x += 8; p += step_x8; } while (x < width); @@ -1314,9 +1336,9 @@ inline void ConvolveKernelHorizontalSigned8Tap( // This function handles blocks of width 2 or 4. 
template <int num_taps, int grade_y, int width, bool is_compound> -void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y, +void ConvolveVerticalScale4xH(const int16_t* const src, const int subpixel_y, const int filter_index, const int step_y, - const int height, void* dest, + const int height, void* const dest, const ptrdiff_t dest_stride) { constexpr ptrdiff_t src_stride = kIntermediateStride; const int16_t* src_y = src; @@ -1327,8 +1349,8 @@ void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y, int p = subpixel_y & 1023; int prev_p = p; - int y = 0; - do { // y < height + int y = height; + do { for (int i = 0; i < num_taps; ++i) { s[i] = vld1_s16(src_y + i * src_stride); } @@ -1381,16 +1403,16 @@ void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y, prev_p = p; dest16_y += dest_stride; dest_y += dest_stride; - - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } template <int num_taps, int grade_y, bool is_compound> -inline void ConvolveVerticalScale(const int16_t* src, const int width, +inline void ConvolveVerticalScale(const int16_t* const src, const int width, const int subpixel_y, const int filter_index, const int step_y, const int height, - void* dest, const ptrdiff_t dest_stride) { + void* const dest, + const ptrdiff_t dest_stride) { constexpr ptrdiff_t src_stride = kIntermediateStride; // A possible improvement is to use arithmetic to decide how many times to // apply filters to same source before checking whether to load new srcs. 
@@ -1401,15 +1423,15 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width, uint8_t* dest_y; int x = 0; - do { // x < width - const int16_t* src_x = src + x; + do { + const int16_t* const src_x = src + x; const int16_t* src_y = src_x; dest16_y = static_cast<uint16_t*>(dest) + x; dest_y = static_cast<uint8_t*>(dest) + x; int p = subpixel_y & 1023; int prev_p = p; - int y = 0; - do { // y < height + int y = height; + do { for (int i = 0; i < num_taps; ++i) { s[i] = vld1q_s16(src_y + i * src_stride); } @@ -1448,9 +1470,8 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width, prev_p = p; dest16_y += dest_stride; dest_y += dest_stride; - - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); x += 8; } while (x < width); } @@ -1462,7 +1483,7 @@ void ConvolveScale2D_NEON(const void* const reference, const int vertical_filter_index, const int subpixel_x, const int subpixel_y, const int step_x, const int step_y, const int width, const int height, - void* prediction, const ptrdiff_t pred_stride) { + void* const prediction, const ptrdiff_t pred_stride) { const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); assert(step_x <= 2048); @@ -1699,12 +1720,13 @@ void ConvolveHorizontal_NEON(const void* const reference, const int /*vertical_filter_index*/, const int horizontal_filter_id, const int /*vertical_filter_id*/, const int width, - const int height, void* prediction, + const int height, void* const prediction, const ptrdiff_t pred_stride) { const int filter_index = GetFilterIndex(horizontal_filter_index, width); // Set |src| to the outermost tap. 
- const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset; - auto* dest = static_cast<uint8_t*>(prediction); + const auto* const src = + static_cast<const uint8_t*>(reference) - kHorizontalOffset; + auto* const dest = static_cast<uint8_t*>(prediction); DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height, horizontal_filter_id, filter_index); @@ -1719,14 +1741,14 @@ uint16x8_t Compound1DShift(const int16x8_t sum) { template <int filter_index, bool is_compound = false, bool negative_outside_taps = false> -void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, +void FilterVertical(const uint8_t* const src, const ptrdiff_t src_stride, void* const dst, const ptrdiff_t dst_stride, const int width, const int height, const uint8x8_t* const taps) { const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; - auto* dst8 = static_cast<uint8_t*>(dst); - auto* dst16 = static_cast<uint16_t*>(dst); + auto* const dst8 = static_cast<uint8_t*>(dst); + auto* const dst16 = static_cast<uint16_t*>(dst); assert(width >= 8); int x = 0; @@ -1754,6 +1776,9 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, } } + // Decreasing the y loop counter produces worse code with clang. + // Don't unroll this loop since it generates too much code and the decoder + // is even slower. 
int y = 0; do { srcs[next_row] = vld1_u8(src_x); @@ -1804,7 +1829,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[0] = Load4(src); src += src_stride; - int y = 0; + int y = height; do { srcs[0] = Load4<1>(src, srcs[0]); src += src_stride; @@ -1829,8 +1854,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, } srcs[0] = srcs[2]; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } else if (num_taps == 4) { srcs[4] = vdup_n_u8(0); @@ -1842,7 +1867,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, src += src_stride; srcs[1] = vext_u8(srcs[0], srcs[2], 4); - int y = 0; + int y = height; do { srcs[2] = Load4<1>(src, srcs[2]); src += src_stride; @@ -1869,8 +1894,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[0] = srcs[2]; srcs[1] = srcs[3]; srcs[2] = srcs[4]; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } else if (num_taps == 6) { srcs[6] = vdup_n_u8(0); @@ -1887,7 +1912,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, src += src_stride; srcs[3] = vext_u8(srcs[2], srcs[4], 4); - int y = 0; + int y = height; do { srcs[4] = Load4<1>(src, srcs[4]); src += src_stride; @@ -1916,8 +1941,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[2] = srcs[4]; srcs[3] = srcs[5]; srcs[4] = srcs[6]; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } else if (num_taps == 8) { srcs[8] = vdup_n_u8(0); @@ -1939,7 +1964,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, src += src_stride; srcs[5] = vext_u8(srcs[4], srcs[6], 4); - int y = 0; + int y = height; do { srcs[6] = Load4<1>(src, srcs[6]); src += src_stride; @@ -1970,8 +1995,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[4] = srcs[6]; srcs[5] = srcs[7]; srcs[6] = srcs[8]; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } } @@ -2186,14 +2211,14 @@ 
void ConvolveVertical_NEON(const void* const reference, const int vertical_filter_index, const int /*horizontal_filter_id*/, const int vertical_filter_id, const int width, - const int height, void* prediction, + const int height, void* const prediction, const ptrdiff_t pred_stride) { const int filter_index = GetFilterIndex(vertical_filter_index, height); const int vertical_taps = GetNumTapsInFilter(filter_index); const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast<const uint8_t*>(reference) - (vertical_taps / 2 - 1) * src_stride; - auto* dest = static_cast<uint8_t*>(prediction); + auto* const dest = static_cast<uint8_t*>(prediction); const ptrdiff_t dest_stride = pred_stride; assert(vertical_filter_id != 0); @@ -2303,7 +2328,7 @@ void ConvolveCompoundCopy_NEON( const void* const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/, - const int width, const int height, void* prediction, + const int width, const int height, void* const prediction, const ptrdiff_t /*pred_stride*/) { const auto* src = static_cast<const uint8_t*>(reference); const ptrdiff_t src_stride = reference_stride; @@ -2312,7 +2337,7 @@ void ConvolveCompoundCopy_NEON( kInterRoundBitsVertical - kInterRoundBitsCompoundVertical; if (width >= 16) { - int y = 0; + int y = height; do { int x = 0; do { @@ -2328,20 +2353,20 @@ void ConvolveCompoundCopy_NEON( } while (x < width); src += src_stride; dest += width; - } while (++y < height); + } while (--y != 0); } else if (width == 8) { - int y = 0; + int y = height; do { const uint8x8_t v_src = vld1_u8(&src[0]); const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift); vst1q_u16(&dest[0], v_dest); src += src_stride; dest += width; - } while (++y < height); - } else { /* width == 4 */ + } while (--y != 0); + } else { // width == 4 uint8x8_t v_src = vdup_n_u8(0); - int y = 0; + int y = height; do { 
v_src = Load4<0>(&src[0], v_src); src += src_stride; @@ -2350,8 +2375,8 @@ void ConvolveCompoundCopy_NEON( const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift); vst1q_u16(&dest[0], v_dest); dest += 4 << 1; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } } @@ -2359,14 +2384,14 @@ void ConvolveCompoundVertical_NEON( const void* const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, const int vertical_filter_index, const int /*horizontal_filter_id*/, const int vertical_filter_id, - const int width, const int height, void* prediction, + const int width, const int height, void* const prediction, const ptrdiff_t /*pred_stride*/) { const int filter_index = GetFilterIndex(vertical_filter_index, height); const int vertical_taps = GetNumTapsInFilter(filter_index); const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast<const uint8_t*>(reference) - (vertical_taps / 2 - 1) * src_stride; - auto* dest = static_cast<uint16_t*>(prediction); + auto* const dest = static_cast<uint16_t*>(prediction); assert(vertical_filter_id != 0); uint8x8_t taps[8]; @@ -2454,24 +2479,39 @@ void ConvolveCompoundHorizontal_NEON( const void* const reference, const ptrdiff_t reference_stride, const int horizontal_filter_index, const int /*vertical_filter_index*/, const int horizontal_filter_id, const int /*vertical_filter_id*/, - const int width, const int height, void* prediction, + const int width, const int height, void* const prediction, const ptrdiff_t /*pred_stride*/) { const int filter_index = GetFilterIndex(horizontal_filter_index, width); - const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset; - auto* dest = static_cast<uint16_t*>(prediction); + const auto* const src = + static_cast<const uint8_t*>(reference) - kHorizontalOffset; + auto* const dest = static_cast<uint16_t*>(prediction); DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>( src, reference_stride, dest, width, width, height, 
horizontal_filter_id, filter_index); } +template <int vertical_taps> +void Compound2DVertical(const uint16_t* const intermediate_result, + const int width, const int height, const int16x8_t taps, + void* const prediction) { + auto* const dest = static_cast<uint16_t*>(prediction); + if (width == 4) { + Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>( + intermediate_result, dest, width, height, taps); + } else { + Filter2DVerticalWidth8AndUp<vertical_taps, /*is_compound=*/true>( + intermediate_result, dest, width, width, height, taps); + } +} + void ConvolveCompound2D_NEON(const void* const reference, const ptrdiff_t reference_stride, const int horizontal_filter_index, const int vertical_filter_index, const int horizontal_filter_id, const int vertical_filter_id, const int width, - const int height, void* prediction, + const int height, void* const prediction, const ptrdiff_t /*pred_stride*/) { // The output of the horizontal filter, i.e. the intermediate_result, is // guaranteed to fit in int16_t. @@ -2492,55 +2532,26 @@ void ConvolveCompound2D_NEON(const void* const reference, const auto* const src = static_cast<const uint8_t*>(reference) - (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset; - DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>( src, src_stride, intermediate_result, width, width, intermediate_height, horizontal_filter_id, horiz_filter_index); // Vertical filter. 
- auto* dest = static_cast<uint16_t*>(prediction); assert(vertical_filter_id != 0); - - const ptrdiff_t dest_stride = width; const int16x8_t taps = vmovl_s8( vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id])); - if (vertical_taps == 8) { - if (width == 4) { - Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest, - dest_stride, height, taps); - } else { - Filter2DVertical<8, /*is_compound=*/true>( - intermediate_result, dest, dest_stride, width, height, taps); - } + Compound2DVertical<8>(intermediate_result, width, height, taps, prediction); } else if (vertical_taps == 6) { - if (width == 4) { - Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest, - dest_stride, height, taps); - } else { - Filter2DVertical<6, /*is_compound=*/true>( - intermediate_result, dest, dest_stride, width, height, taps); - } + Compound2DVertical<6>(intermediate_result, width, height, taps, prediction); } else if (vertical_taps == 4) { - if (width == 4) { - Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest, - dest_stride, height, taps); - } else { - Filter2DVertical<4, /*is_compound=*/true>( - intermediate_result, dest, dest_stride, width, height, taps); - } + Compound2DVertical<4>(intermediate_result, width, height, taps, prediction); } else { // |vertical_taps| == 2 - if (width == 4) { - Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest, - dest_stride, height, taps); - } else { - Filter2DVertical<2, /*is_compound=*/true>( - intermediate_result, dest, dest_stride, width, height, taps); - } + Compound2DVertical<2>(intermediate_result, width, height, taps, prediction); } } -inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) { +inline void HalfAddHorizontal(const uint8_t* const src, uint8_t* const dst) { const uint8x16_t left = vld1q_u8(src); const uint8x16_t right = vld1q_u8(src + 1); vst1q_u8(dst, vrhaddq_u8(left, right)); @@ -2554,7 +2565,7 @@ inline void 
IntraBlockCopyHorizontal(const uint8_t* src, const ptrdiff_t src_remainder_stride = src_stride - (width - 16); const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16); - int y = 0; + int y = height; do { HalfAddHorizontal(src, dst); if (width >= 32) { @@ -2586,7 +2597,7 @@ inline void IntraBlockCopyHorizontal(const uint8_t* src, } src += src_remainder_stride; dst += dst_remainder_stride; - } while (++y < height); + } while (--y != 0); } void ConvolveIntraBlockCopyHorizontal_NEON( @@ -2610,7 +2621,7 @@ void ConvolveIntraBlockCopyHorizontal_NEON( IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest, pred_stride); } else if (width == 8) { - int y = 0; + int y = height; do { const uint8x8_t left = vld1_u8(src); const uint8x8_t right = vld1_u8(src + 1); @@ -2618,11 +2629,11 @@ void ConvolveIntraBlockCopyHorizontal_NEON( src += reference_stride; dest += pred_stride; - } while (++y < height); + } while (--y != 0); } else if (width == 4) { uint8x8_t left = vdup_n_u8(0); uint8x8_t right = vdup_n_u8(0); - int y = 0; + int y = height; do { left = Load4<0>(src, left); right = Load4<0>(src + 1, right); @@ -2637,13 +2648,13 @@ void ConvolveIntraBlockCopyHorizontal_NEON( dest += pred_stride; StoreHi4(dest, result); dest += pred_stride; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } else { assert(width == 2); uint8x8_t left = vdup_n_u8(0); uint8x8_t right = vdup_n_u8(0); - int y = 0; + int y = height; do { left = Load2<0>(src, left); right = Load2<0>(src + 1, right); @@ -2658,8 +2669,8 @@ void ConvolveIntraBlockCopyHorizontal_NEON( dest += pred_stride; Store2<1>(dest, result); dest += pred_stride; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } } @@ -2694,7 +2705,7 @@ inline void IntraBlockCopyVertical(const uint8_t* src, } src += src_remainder_stride; - int y = 0; + int y = height; do { below[0] = vld1q_u8(src); if (width >= 32) { @@ -2749,7 +2760,7 @@ inline void IntraBlockCopyVertical(const uint8_t* src, } } dst += 
dst_remainder_stride; - } while (++y < height); + } while (--y != 0); } void ConvolveIntraBlockCopyVertical_NEON( @@ -2778,7 +2789,7 @@ void ConvolveIntraBlockCopyVertical_NEON( row = vld1_u8(src); src += reference_stride; - int y = 0; + int y = height; do { below = vld1_u8(src); src += reference_stride; @@ -2787,13 +2798,13 @@ void ConvolveIntraBlockCopyVertical_NEON( dest += pred_stride; row = below; - } while (++y < height); + } while (--y != 0); } else if (width == 4) { uint8x8_t row = Load4(src); uint8x8_t below = vdup_n_u8(0); src += reference_stride; - int y = 0; + int y = height; do { below = Load4<0>(src, below); src += reference_stride; @@ -2802,14 +2813,14 @@ void ConvolveIntraBlockCopyVertical_NEON( dest += pred_stride; row = below; - } while (++y < height); + } while (--y != 0); } else { assert(width == 2); uint8x8_t row = Load2(src); uint8x8_t below = vdup_n_u8(0); src += reference_stride; - int y = 0; + int y = height; do { below = Load2<0>(src, below); src += reference_stride; @@ -2818,7 +2829,7 @@ void ConvolveIntraBlockCopyVertical_NEON( dest += pred_stride; row = below; - } while (++y < height); + } while (--y != 0); } } @@ -2870,7 +2881,7 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride, } src += src_remainder_stride; - int y = 0; + int y = height; do { const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2)); @@ -2981,7 +2992,7 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride, } src += src_remainder_stride; dst += dst_remainder_stride; - } while (++y < height); + } while (--y != 0); } void ConvolveIntraBlockCopy2D_NEON( @@ -3013,7 +3024,7 @@ void ConvolveIntraBlockCopy2D_NEON( uint16x4_t row = vget_low_u16(vaddl_u8(left, right)); - int y = 0; + int y = height; do { left = Load4<0>(src, left); right = Load4<0>(src + 1, right); @@ -3032,8 +3043,8 @@ void ConvolveIntraBlockCopy2D_NEON( dest += pred_stride; row = 
vget_high_u16(below); - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } else { uint8x8_t left = Load2(src); uint8x8_t right = Load2(src + 1); @@ -3041,7 +3052,7 @@ void ConvolveIntraBlockCopy2D_NEON( uint16x4_t row = vget_low_u16(vaddl_u8(left, right)); - int y = 0; + int y = height; do { left = Load2<0>(src, left); right = Load2<0>(src + 1, right); @@ -3060,8 +3071,8 @@ void ConvolveIntraBlockCopy2D_NEON( dest += pred_stride; row = vget_high_u16(below); - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } } @@ -3093,7 +3104,7 @@ void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/distance_weighted_blend_neon.cc b/src/dsp/arm/distance_weighted_blend_neon.cc index 04952ab..a0cd0ac 100644 --- a/src/dsp/arm/distance_weighted_blend_neon.cc +++ b/src/dsp/arm/distance_weighted_blend_neon.cc @@ -30,10 +30,12 @@ namespace libgav1 { namespace dsp { -namespace { constexpr int kInterPostRoundBit = 4; +namespace low_bitdepth { +namespace { + inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0, const int16x8_t pred1, const int16x4_t weights[2]) { @@ -185,13 +187,167 @@ void Init8bpp() { } } // namespace +} // namespace low_bitdepth + +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +inline uint16x4x2_t ComputeWeightedAverage8(const uint16x4x2_t pred0, + const uint16x4x2_t pred1, + const uint16x4_t weights[2]) { + const uint32x4_t wpred0_lo = vmull_u16(weights[0], pred0.val[0]); + const uint32x4_t wpred0_hi = vmull_u16(weights[0], pred0.val[1]); + const uint32x4_t blended_lo = vmlal_u16(wpred0_lo, weights[1], pred1.val[0]); + const uint32x4_t blended_hi = vmlal_u16(wpred0_hi, weights[1], pred1.val[1]); + const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16); + 
const int32x4_t res_lo = vsubq_s32(vreinterpretq_s32_u32(blended_lo), offset); + const int32x4_t res_hi = vsubq_s32(vreinterpretq_s32_u32(blended_hi), offset); + const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1); + // Clip the result at (1 << bd) - 1. + uint16x4x2_t result; + result.val[0] = + vmin_u16(vqrshrun_n_s32(res_lo, kInterPostRoundBit + 4), bd_max); + result.val[1] = + vmin_u16(vqrshrun_n_s32(res_hi, kInterPostRoundBit + 4), bd_max); + return result; +} + +inline uint16x4x4_t ComputeWeightedAverage8(const uint16x4x4_t pred0, + const uint16x4x4_t pred1, + const uint16x4_t weights[2]) { + const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16); + const uint32x4_t wpred0 = vmull_u16(weights[0], pred0.val[0]); + const uint32x4_t wpred1 = vmull_u16(weights[0], pred0.val[1]); + const uint32x4_t blended0 = vmlal_u16(wpred0, weights[1], pred1.val[0]); + const uint32x4_t blended1 = vmlal_u16(wpred1, weights[1], pred1.val[1]); + const int32x4_t res0 = vsubq_s32(vreinterpretq_s32_u32(blended0), offset); + const int32x4_t res1 = vsubq_s32(vreinterpretq_s32_u32(blended1), offset); + const uint32x4_t wpred2 = vmull_u16(weights[0], pred0.val[2]); + const uint32x4_t wpred3 = vmull_u16(weights[0], pred0.val[3]); + const uint32x4_t blended2 = vmlal_u16(wpred2, weights[1], pred1.val[2]); + const uint32x4_t blended3 = vmlal_u16(wpred3, weights[1], pred1.val[3]); + const int32x4_t res2 = vsubq_s32(vreinterpretq_s32_u32(blended2), offset); + const int32x4_t res3 = vsubq_s32(vreinterpretq_s32_u32(blended3), offset); + const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1); + // Clip the result at (1 << bd) - 1. 
+ uint16x4x4_t result; + result.val[0] = + vmin_u16(vqrshrun_n_s32(res0, kInterPostRoundBit + 4), bd_max); + result.val[1] = + vmin_u16(vqrshrun_n_s32(res1, kInterPostRoundBit + 4), bd_max); + result.val[2] = + vmin_u16(vqrshrun_n_s32(res2, kInterPostRoundBit + 4), bd_max); + result.val[3] = + vmin_u16(vqrshrun_n_s32(res3, kInterPostRoundBit + 4), bd_max); + + return result; +} + +// We could use vld1_u16_x2, but for compatibility reasons, use this function +// instead. The compiler optimizes to the correct instruction. +inline uint16x4x2_t LoadU16x4_x2(uint16_t const* ptr) { + uint16x4x2_t x; + // gcc/clang (64 bit) optimizes the following to ldp. + x.val[0] = vld1_u16(ptr); + x.val[1] = vld1_u16(ptr + 4); + return x; +} + +// We could use vld1_u16_x4, but for compatibility reasons, use this function +// instead. The compiler optimizes to a pair of vld1_u16_x2, which showed better +// performance in the speed tests. +inline uint16x4x4_t LoadU16x4_x4(uint16_t const* ptr) { + uint16x4x4_t x; + x.val[0] = vld1_u16(ptr); + x.val[1] = vld1_u16(ptr + 4); + x.val[2] = vld1_u16(ptr + 8); + x.val[3] = vld1_u16(ptr + 12); + return x; +} + +void DistanceWeightedBlend_NEON(const void* prediction_0, + const void* prediction_1, + const uint8_t weight_0, const uint8_t weight_1, + const int width, const int height, + void* const dest, const ptrdiff_t dest_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]); + const uint16x4_t weights[2] = {vdup_n_u16(weight_0), vdup_n_u16(weight_1)}; -void DistanceWeightedBlendInit_NEON() { Init8bpp(); } + if (width == 4) { + int y = height; + do { + const uint16x4x2_t src0 = LoadU16x4_x2(pred_0); + const uint16x4x2_t src1 = LoadU16x4_x2(pred_1); + const uint16x4x2_t res = ComputeWeightedAverage8(src0, src1, weights); + vst1_u16(dst, res.val[0]); + 
vst1_u16(dst + dst_stride, res.val[1]); + dst += dst_stride << 1; + pred_0 += 8; + pred_1 += 8; + y -= 2; + } while (y != 0); + } else if (width == 8) { + int y = height; + do { + const uint16x4x4_t src0 = LoadU16x4_x4(pred_0); + const uint16x4x4_t src1 = LoadU16x4_x4(pred_1); + const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights); + vst1_u16(dst, res.val[0]); + vst1_u16(dst + 4, res.val[1]); + vst1_u16(dst + dst_stride, res.val[2]); + vst1_u16(dst + dst_stride + 4, res.val[3]); + dst += dst_stride << 1; + pred_0 += 16; + pred_1 += 16; + y -= 2; + } while (y != 0); + } else { + int y = height; + do { + int x = 0; + do { + const uint16x4x4_t src0 = LoadU16x4_x4(pred_0 + x); + const uint16x4x4_t src1 = LoadU16x4_x4(pred_1 + x); + const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights); + vst1_u16(dst + x, res.val[0]); + vst1_u16(dst + x + 4, res.val[1]); + vst1_u16(dst + x + 8, res.val[2]); + vst1_u16(dst + x + 12, res.val[3]); + x += 16; + } while (x < width); + dst += dst_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); + } +} + +void Init10bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->distance_weighted_blend = DistanceWeightedBlend_NEON; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void DistanceWeightedBlendInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/distance_weighted_blend_neon.h b/src/dsp/arm/distance_weighted_blend_neon.h index 4d8824c..94a799c 100644 --- a/src/dsp/arm/distance_weighted_blend_neon.h +++ b/src/dsp/arm/distance_weighted_blend_neon.h @@ -34,6 +34,8 @@ void DistanceWeightedBlendInit_NEON(); #if LIBGAV1_ENABLE_NEON #define LIBGAV1_Dsp8bpp_DistanceWeightedBlend 
LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON + #endif // LIBGAV1_ENABLE_NEON #endif // LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_ diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc index 2612466..8ee3745 100644 --- a/src/dsp/arm/film_grain_neon.cc +++ b/src/dsp/arm/film_grain_neon.cc @@ -1176,7 +1176,7 @@ void FilmGrainInit_NEON() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/intra_edge_neon.cc b/src/dsp/arm/intra_edge_neon.cc index 00b186a..074283f 100644 --- a/src/dsp/arm/intra_edge_neon.cc +++ b/src/dsp/arm/intra_edge_neon.cc @@ -25,7 +25,7 @@ #include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" -#include "src/utils/common.h" // RightShiftWithRounding() +#include "src/utils/common.h" namespace libgav1 { namespace dsp { @@ -35,6 +35,11 @@ namespace { // required. constexpr int kKernelsNEON[3][2] = {{4, 8}, {5, 6}}; +} // namespace + +namespace low_bitdepth { +namespace { + void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { assert(strength == 1 || strength == 2 || strength == 3); const int kernel_index = strength - 1; @@ -44,6 +49,8 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { // elements written is |size| - 1. if (size == 1) return; + const uint8x16_t v_index = vcombine_u8(vcreate_u8(0x0706050403020100), + vcreate_u8(0x0f0e0d0c0b0a0908)); // |strength| 1 and 2 use a 3 tap filter. if (strength < 3) { // The last value requires extending the buffer (duplicating @@ -89,7 +96,6 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { // |remainder| == 1 then we don't have to do anything. 
const int remainder = (size - 1) & 0xf; if (remainder > 1) { - uint8_t temp[16]; const uint8x16_t src_1 = vld1q_u8(dst_buffer + i); const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1); @@ -102,9 +108,11 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { const uint8x16_t result = vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4)); - - vst1q_u8(temp, result); - memcpy(dst_buffer + i, temp, remainder); + const uint8x16_t v_remainder = vdupq_n_u8(remainder); + // Create over write mask. + const uint8x16_t mask = vcleq_u8(v_remainder, v_index); + const uint8x16_t dst_remainder = vbslq_u8(mask, src_1, result); + vst1q_u8(dst_buffer + i, dst_remainder); } dst_buffer[size - 1] = last_val; @@ -173,7 +181,6 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { // Like the 3 tap but if there are two remaining values we have already // calculated them. if (remainder > 2) { - uint8_t temp[16]; const uint8x16_t src_2 = vld1q_u8(dst_buffer + i); const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1); const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2); @@ -193,9 +200,11 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { const uint8x16_t result = vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4)); - - vst1q_u8(temp, result); - memcpy(dst_buffer + i, temp, remainder); + const uint8x16_t v_remainder = vdupq_n_u8(remainder); + // Create over write mask. 
+ const uint8x16_t mask = vcleq_u8(v_remainder, v_index); + const uint8x16_t dst_remainder = vbslq_u8(mask, src_2, result); + vst1q_u8(dst_buffer + i, dst_remainder); } dst_buffer[1] = special_vals[0]; @@ -284,13 +293,225 @@ void Init8bpp() { } } // namespace +} // namespace low_bitdepth + +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +const uint16_t kRemainderMask[8][8] = { + {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000}, +}; + +void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { + assert(strength == 1 || strength == 2 || strength == 3); + const int kernel_index = strength - 1; + auto* const dst_buffer = static_cast<uint16_t*>(buffer); + + // The first element is not written out (but it is input) so the number of + // elements written is |size| - 1. + if (size == 1) return; + + // |strength| 1 and 2 use a 3 tap filter. + if (strength < 3) { + // The last value requires extending the buffer (duplicating + // |dst_buffer[size - 1]). Calculate it here to avoid extra processing in + // neon. 
+ const uint16_t last_val = RightShiftWithRounding( + kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] + + kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] + + kKernelsNEON[kernel_index][0] * dst_buffer[size - 1], + 4); + + const uint16_t krn0 = kKernelsNEON[kernel_index][0]; + const uint16_t krn1 = kKernelsNEON[kernel_index][1]; + + // The first value we need gets overwritten by the output from the + // previous iteration. + uint16x8_t src_0 = vld1q_u16(dst_buffer); + int i = 1; + + // Process blocks until there are less than 16 values remaining. + for (; i < size - 7; i += 8) { + // Loading these at the end of the block with |src_0| will read past the + // end of |top_row_data[160]|, the source of |buffer|. + const uint16x8_t src_1 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0); + const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + // Load the next row before overwriting. This loads an extra 7 values + // past |size| on the trailing iteration. + src_0 = vld1q_u16(dst_buffer + i + 7); + vst1q_u16(dst_buffer + i, result); + } + + // The last output value |last_val| was already calculated so if + // |remainder| == 1 then we don't have to do anything. + const int remainder = (size - 1) & 0x7; + if (remainder > 1) { + const uint16x8_t src_1 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0); + const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]); + const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_1); + vst1q_u16(dst_buffer + i, dst_remainder); + } + + dst_buffer[size - 1] = last_val; + return; + } + + assert(strength == 3); + // 5 tap filter. 
The first element requires duplicating |buffer[0]| and the + // last two elements require duplicating |buffer[size - 1]|. + uint16_t special_vals[3]; + special_vals[0] = RightShiftWithRounding( + (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) + + (dst_buffer[2] << 2) + (dst_buffer[3] << 1), + 4); + // Clamp index for very small |size| values. + const int first_index_min = std::max(size - 4, 0); + const int second_index_min = std::max(size - 3, 0); + const int third_index_min = std::max(size - 2, 0); + special_vals[1] = RightShiftWithRounding( + (dst_buffer[first_index_min] << 1) + (dst_buffer[second_index_min] << 2) + + (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) + + (dst_buffer[size - 1] << 1), + 4); + special_vals[2] = RightShiftWithRounding( + (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) + + // x << 2 + x << 2 == x << 3 + (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1), + 4); + + // The first two values we need get overwritten by the output from the + // previous iteration. + uint16x8_t src_0 = vld1q_u16(dst_buffer - 1); + uint16x8_t src_1 = vld1q_u16(dst_buffer); + int i = 1; + + for (; i < size - 7; i += 8) { + // Loading these at the end of the block with |src_[01]| will read past + // the end of |top_row_data[160]|, the source of |buffer|. + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2); + const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1); + const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3); + const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2)); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + + // Load the next before overwriting. 
+ src_0 = vld1q_u16(dst_buffer + i + 6); + src_1 = vld1q_u16(dst_buffer + i + 7); + + vst1q_u16(dst_buffer + i, result); + } + + const int remainder = (size - 1) & 0x7; + // Like the 3 tap but if there are two remaining values we have already + // calculated them. + if (remainder > 2) { + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2); + const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1); + const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3); + const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2)); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]); + const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_2); + vst1q_u16(dst_buffer + i, dst_remainder); + } + + dst_buffer[1] = special_vals[0]; + // Avoid overwriting |dst_buffer[0]|. + if (size > 2) dst_buffer[size - 2] = special_vals[1]; + dst_buffer[size - 1] = special_vals[2]; +} + +void IntraEdgeUpsampler_NEON(void* buffer, const int size) { + assert(size % 4 == 0 && size <= 16); + auto* const pixel_buffer = static_cast<uint16_t*>(buffer); -void IntraEdgeInit_NEON() { Init8bpp(); } + // Extend first/last samples + pixel_buffer[-2] = pixel_buffer[-1]; + pixel_buffer[size] = pixel_buffer[size - 1]; + + const int16x8_t src_lo = vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2)); + const int16x8_t src_hi = + vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2 + 8)); + const int16x8_t src9_hi = vaddq_s16(src_hi, vshlq_n_s16(src_hi, 3)); + const int16x8_t src9_lo = vaddq_s16(src_lo, vshlq_n_s16(src_lo, 3)); + + int16x8_t sum_lo = vsubq_s16(vextq_s16(src9_lo, src9_hi, 1), src_lo); + sum_lo = vaddq_s16(sum_lo, vextq_s16(src9_lo, src9_hi, 2)); + sum_lo = vsubq_s16(sum_lo, vextq_s16(src_lo, src_hi, 3)); + sum_lo = vrshrq_n_s16(sum_lo, 4); + + uint16x8x2_t result_lo; + result_lo.val[0] = + 
vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_lo, vdupq_n_s16(0))), + vdupq_n_u16((1 << kBitdepth10) - 1)); + result_lo.val[1] = vreinterpretq_u16_s16(vextq_s16(src_lo, src_hi, 2)); + + if (size > 8) { + const int16x8_t src_hi_extra = + vreinterpretq_s16_u16(vld1q_u16(pixel_buffer + 16 - 2)); + const int16x8_t src9_hi_extra = + vaddq_s16(src_hi_extra, vshlq_n_s16(src_hi_extra, 3)); + + int16x8_t sum_hi = vsubq_s16(vextq_s16(src9_hi, src9_hi_extra, 1), src_hi); + sum_hi = vaddq_s16(sum_hi, vextq_s16(src9_hi, src9_hi_extra, 2)); + sum_hi = vsubq_s16(sum_hi, vextq_s16(src_hi, src_hi_extra, 3)); + sum_hi = vrshrq_n_s16(sum_hi, 4); + + uint16x8x2_t result_hi; + result_hi.val[0] = + vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_hi, vdupq_n_s16(0))), + vdupq_n_u16((1 << kBitdepth10) - 1)); + result_hi.val[1] = + vreinterpretq_u16_s16(vextq_s16(src_hi, src_hi_extra, 2)); + vst2q_u16(pixel_buffer - 1, result_lo); + vst2q_u16(pixel_buffer + 15, result_hi); + } else { + vst2q_u16(pixel_buffer - 1, result_lo); + } +} + +void Init10bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->intra_edge_filter = IntraEdgeFilter_NEON; + dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraEdgeInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/intra_edge_neon.h b/src/dsp/arm/intra_edge_neon.h index d3bb243..28e3494 100644 --- a/src/dsp/arm/intra_edge_neon.h +++ b/src/dsp/arm/intra_edge_neon.h @@ -34,6 +34,9 @@ void IntraEdgeInit_NEON(); #define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_IntraEdgeFilter LIBGAV1_CPU_NEON 
+#define LIBGAV1_Dsp10bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON + #endif // LIBGAV1_ENABLE_NEON #endif // LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_ diff --git a/src/dsp/arm/intrapred_cfl_neon.cc b/src/dsp/arm/intrapred_cfl_neon.cc index 45fe33b..8d8748f 100644 --- a/src/dsp/arm/intrapred_cfl_neon.cc +++ b/src/dsp/arm/intrapred_cfl_neon.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "src/dsp/intrapred.h" +#include "src/dsp/intrapred_cfl.h" #include "src/utils/cpu.h" #if LIBGAV1_ENABLE_NEON @@ -27,45 +27,20 @@ #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/utils/common.h" +#include "src/utils/constants.h" namespace libgav1 { namespace dsp { -namespace low_bitdepth { -namespace { - -uint8x16_t Set2ValuesQ(const uint8_t* a) { - uint16_t combined_values = a[0] | a[1] << 8; - return vreinterpretq_u8_u16(vdupq_n_u16(combined_values)); -} - -uint32_t SumVector(uint32x2_t a) { -#if defined(__aarch64__) - return vaddv_u32(a); -#else - const uint64x1_t b = vpaddl_u32(a); - return vget_lane_u32(vreinterpret_u32_u64(b), 0); -#endif // defined(__aarch64__) -} - -uint32_t SumVector(uint32x4_t a) { -#if defined(__aarch64__) - return vaddvq_u32(a); -#else - const uint64x2_t b = vpaddlq_u32(a); - const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b)); - return vget_lane_u32(vreinterpret_u32_u64(c), 0); -#endif // defined(__aarch64__) -} // Divide by the number of elements. -uint32_t Average(const uint32_t sum, const int width, const int height) { +inline uint32_t Average(const uint32_t sum, const int width, const int height) { return RightShiftWithRounding(sum, FloorLog2(width) + FloorLog2(height)); } // Subtract |val| from every element in |a|. 
-void BlockSubtract(const uint32_t val, - int16_t a[kCflLumaBufferStride][kCflLumaBufferStride], - const int width, const int height) { +inline void BlockSubtract(const uint32_t val, + int16_t a[kCflLumaBufferStride][kCflLumaBufferStride], + const int width, const int height) { assert(val <= INT16_MAX); const int16x8_t val_v = vdupq_n_s16(static_cast<int16_t>(val)); @@ -94,6 +69,9 @@ void BlockSubtract(const uint32_t val, } } +namespace low_bitdepth { +namespace { + template <int block_width, int block_height> void CflSubsampler420_NEON( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], @@ -122,26 +100,27 @@ void CflSubsampler420_NEON( sum = SumVector(running_sum); } else if (block_width == 8) { - const uint8x16_t x_index = {0, 0, 2, 2, 4, 4, 6, 6, - 8, 8, 10, 10, 12, 12, 14, 14}; - const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2); - const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index); + const uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14}; + const uint16x8_t x_max_index = + vdupq_n_u16(max_luma_width == 8 ? max_luma_width - 2 : 16); + const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index); uint32x4_t running_sum = vdupq_n_u32(0); for (int y = 0; y < block_height; ++y) { - const uint8x16_t x_max0 = Set2ValuesQ(src + max_luma_width - 2); - const uint8x16_t x_max1 = Set2ValuesQ(src + max_luma_width - 2 + stride); + const uint8x16_t row0 = vld1q_u8(src); + const uint8x16_t row1 = vld1q_u8(src + stride); + const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1); + const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1); - uint8x16_t row0 = vld1q_u8(src); - row0 = vbslq_u8(x_mask, row0, x_max0); - uint8x16_t row1 = vld1q_u8(src + stride); - row1 = vbslq_u8(x_mask, row1, x_max1); + // Dup the 2x2 sum at the max luma offset. 
+ const uint16x8_t max_luma_sum = + vdupq_lane_u16(vget_low_u16(sum_row_shifted), 3); + const uint16x8_t final_sum_row = + vbslq_u16(x_mask, sum_row_shifted, max_luma_sum); + vst1q_s16(luma[y], vreinterpretq_s16_u16(final_sum_row)); - uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1); - sum_row = vshlq_n_u16(sum_row, 1); - running_sum = vpadalq_u16(running_sum, sum_row); - vst1q_s16(luma[y], vreinterpretq_s16_u16(sum_row)); + running_sum = vpadalq_u16(running_sum, final_sum_row); if (y << 1 < max_luma_height - 2) { src += stride << 1; @@ -150,45 +129,35 @@ void CflSubsampler420_NEON( sum = SumVector(running_sum); } else /* block_width >= 16 */ { - const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2); + const uint16x8_t x_max_index = vdupq_n_u16(max_luma_width - 2); uint32x4_t running_sum = vdupq_n_u32(0); for (int y = 0; y < block_height; ++y) { - uint8x16_t x_index = {0, 2, 4, 6, 8, 10, 12, 14, - 16, 18, 20, 22, 24, 26, 28, 30}; - const uint8x16_t x_max00 = vdupq_n_u8(src[max_luma_width - 2]); - const uint8x16_t x_max01 = vdupq_n_u8(src[max_luma_width - 2 + 1]); - const uint8x16_t x_max10 = vdupq_n_u8(src[stride + max_luma_width - 2]); - const uint8x16_t x_max11 = - vdupq_n_u8(src[stride + max_luma_width - 2 + 1]); - for (int x = 0; x < block_width; x += 16) { - const ptrdiff_t src_x_offset = x << 1; - const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index); - const uint8x16x2_t row0 = vld2q_u8(src + src_x_offset); - const uint8x16x2_t row1 = vld2q_u8(src + src_x_offset + stride); - const uint8x16_t row_masked_00 = vbslq_u8(x_mask, row0.val[0], x_max00); - const uint8x16_t row_masked_01 = vbslq_u8(x_mask, row0.val[1], x_max01); - const uint8x16_t row_masked_10 = vbslq_u8(x_mask, row1.val[0], x_max10); - const uint8x16_t row_masked_11 = vbslq_u8(x_mask, row1.val[1], x_max11); - - uint16x8_t sum_row_lo = - vaddl_u8(vget_low_u8(row_masked_00), vget_low_u8(row_masked_01)); - sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_10)); - sum_row_lo = 
vaddw_u8(sum_row_lo, vget_low_u8(row_masked_11)); - sum_row_lo = vshlq_n_u16(sum_row_lo, 1); - running_sum = vpadalq_u16(running_sum, sum_row_lo); - vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(sum_row_lo)); - - uint16x8_t sum_row_hi = - vaddl_u8(vget_high_u8(row_masked_00), vget_high_u8(row_masked_01)); - sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_10)); - sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_11)); - sum_row_hi = vshlq_n_u16(sum_row_hi, 1); - running_sum = vpadalq_u16(running_sum, sum_row_hi); - vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(sum_row_hi)); - - x_index = vaddq_u8(x_index, vdupq_n_u8(32)); + // Calculate the 2x2 sum at the max_luma offset + const uint8_t a00 = src[max_luma_width - 2]; + const uint8_t a01 = src[max_luma_width - 1]; + const uint8_t a10 = src[max_luma_width - 2 + stride]; + const uint8_t a11 = src[max_luma_width - 1 + stride]; + // Dup the 2x2 sum at the max luma offset. + const uint16x8_t max_luma_sum = + vdupq_n_u16((uint16_t)((a00 + a01 + a10 + a11) << 1)); + uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14}; + + ptrdiff_t src_x_offset = 0; + for (int x = 0; x < block_width; x += 8, src_x_offset += 16) { + const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index); + const uint8x16_t row0 = vld1q_u8(src + src_x_offset); + const uint8x16_t row1 = vld1q_u8(src + src_x_offset + stride); + const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1); + const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1); + const uint16x8_t final_sum_row = + vbslq_u16(x_mask, sum_row_shifted, max_luma_sum); + vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(final_sum_row)); + + running_sum = vpadalq_u16(running_sum, final_sum_row); + x_index = vaddq_u16(x_index, vdupq_n_u16(16)); } + if (y << 1 < max_luma_height - 2) { src += stride << 1; } @@ -209,17 +178,30 @@ void CflSubsampler444_NEON( uint32_t sum; if (block_width == 4) { assert(max_luma_width >= 4); + assert(max_luma_height <= block_height); + 
assert((max_luma_height % 2) == 0); uint32x4_t running_sum = vdupq_n_u32(0); uint8x8_t row = vdup_n_u8(0); - for (int y = 0; y < block_height; y += 2) { + uint16x8_t row_shifted; + int y = 0; + do { row = Load4<0>(src, row); row = Load4<1>(src + stride, row); if (y < (max_luma_height - 1)) { src += stride << 1; } - const uint16x8_t row_shifted = vshll_n_u8(row, 3); + row_shifted = vshll_n_u8(row, 3); + running_sum = vpadalq_u16(running_sum, row_shifted); + vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted))); + vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted))); + y += 2; + } while (y < max_luma_height); + + row_shifted = + vcombine_u16(vget_high_u16(row_shifted), vget_high_u16(row_shifted)); + for (; y < block_height; y += 2) { running_sum = vpadalq_u16(running_sum, row_shifted); vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted))); vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted))); @@ -463,12 +445,874 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void IntraPredCflInit_NEON() { low_bitdepth::Init8bpp(); } +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +//------------------------------------------------------------------------------ +// CflSubsampler +#ifndef __aarch64__ +uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { + return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)), + vpadd_u16(vget_low_u16(b), vget_high_u16(b))); +} +#endif + +// This duplicates the last two 16-bit values in |row|. +inline uint16x8_t LastRowSamples(const uint16x8_t row) { + const uint32x2_t a = vget_high_u32(vreinterpretq_u32_u16(row)); + const uint32x4_t b = vdupq_lane_u32(a, 1); + return vreinterpretq_u16_u32(b); +} + +// This duplicates the last unsigned 16-bit value in |row|. 
+inline uint16x8_t LastRowResult(const uint16x8_t row) { + const uint16x4_t a = vget_high_u16(row); + const uint16x8_t b = vdupq_lane_u16(a, 0x3); + return b; +} + +// This duplicates the last signed 16-bit value in |row|. +inline int16x8_t LastRowResult(const int16x8_t row) { + const int16x4_t a = vget_high_s16(row); + const int16x8_t b = vdupq_lane_s16(a, 0x3); + return b; +} + +// Takes in two sums of input row pairs, and completes the computation for two +// output rows. +inline uint16x8_t StoreLumaResults4_420(const uint16x8_t vertical_sum0, + const uint16x8_t vertical_sum1, + int16_t* luma_ptr) { + const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1); + const uint16x8_t result_shifted = vshlq_n_u16(result, 1); + vst1_s16(luma_ptr, vreinterpret_s16_u16(vget_low_u16(result_shifted))); + vst1_s16(luma_ptr + kCflLumaBufferStride, + vreinterpret_s16_u16(vget_high_u16(result_shifted))); + return result_shifted; +} + +// Takes two halves of a vertically added pair of rows and completes the +// computation for one output row. 
+inline uint16x8_t StoreLumaResults8_420(const uint16x8_t vertical_sum0, + const uint16x8_t vertical_sum1, + int16_t* luma_ptr) { + const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1); + const uint16x8_t result_shifted = vshlq_n_u16(result, 1); + vst1q_s16(luma_ptr, vreinterpretq_s16_u16(result_shifted)); + return result_shifted; +} + +template <int block_height_log2, bool is_inside> +void CflSubsampler444_4xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + static_assert(block_height_log2 <= 4, ""); + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + uint16x4_t sum = vdup_n_u16(0); + uint16x4_t samples[2]; + int y = visible_height; + + do { + samples[0] = vld1_u16(src); + samples[1] = vld1_u16(src + src_stride); + src += src_stride << 1; + sum = vadd_u16(sum, samples[0]); + sum = vadd_u16(sum, samples[1]); + y -= 2; + } while (y != 0); + + if (!is_inside) { + y = visible_height; + samples[1] = vshl_n_u16(samples[1], 1); + do { + sum = vadd_u16(sum, samples[1]); + y += 2; + } while (y < block_height); + } + + // Here the left shift by 3 (to increase precision) is nullified in right + // shift ((log2 of width 4) + 1). 
+ const uint32_t average_sum = + RightShiftWithRounding(SumVector(vpaddl_u16(sum)), block_height_log2 - 1); + const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum)); + + const auto* ssrc = static_cast<const int16_t*>(source); + int16x4_t ssample; + luma_ptr = luma[0]; + y = visible_height; + do { + ssample = vld1_s16(ssrc); + ssample = vshl_n_s16(ssample, 3); + vst1_s16(luma_ptr, vsub_s16(ssample, averages)); + ssrc += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + vst1_s16(luma_ptr, vsub_s16(ssample, averages)); + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_height_log2> +void CflSubsampler444_4xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_cast<void>(max_luma_width); + static_cast<void>(max_luma_height); + static_assert(block_height_log2 <= 4, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + + if (block_height <= max_luma_height) { + CflSubsampler444_4xH_NEON<block_height_log2, true>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler444_4xH_NEON<block_height_log2, false>(luma, max_luma_height, + source, stride); + } +} + +template <int block_height_log2, bool is_inside> +void CflSubsampler444_8xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + uint32x4_t sum = vdupq_n_u32(0); + uint16x8_t samples; + int y = visible_height; + + do { + samples = 
vld1q_u16(src); + src += src_stride; + sum = vpadalq_u16(sum, samples); + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + do { + sum = vpadalq_u16(sum, samples); + } while (++y < block_height); + } + + // Here the left shift by 3 (to increase precision) is nullified in right + // shift (log2 of width 8). + const uint32_t average_sum = + RightShiftWithRounding(SumVector(sum), block_height_log2); + const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum)); + + const auto* ssrc = static_cast<const int16_t*>(source); + int16x8_t ssample; + luma_ptr = luma[0]; + y = visible_height; + do { + ssample = vld1q_s16(ssrc); + ssample = vshlq_n_s16(ssample, 3); + vst1q_s16(luma_ptr, vsubq_s16(ssample, averages)); + ssrc += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + vst1q_s16(luma_ptr, vsubq_s16(ssample, averages)); + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_height_log2> +void CflSubsampler444_8xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_cast<void>(max_luma_width); + static_cast<void>(max_luma_height); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + const int block_width = 8; + + const int horz_inside = block_width <= max_luma_width; + const int vert_inside = block_height <= max_luma_height; + if (horz_inside && vert_inside) { + CflSubsampler444_8xH_NEON<block_height_log2, true>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler444_8xH_NEON<block_height_log2, false>(luma, max_luma_height, + source, stride); + } +} + +template <int block_width_log2, int block_height_log2, bool is_inside> +void CflSubsampler444_WxH_NEON( + 
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const int block_width = 1 << block_width_log2; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + uint32x4_t sum = vdupq_n_u32(0); + uint16x8_t samples[4]; + int y = visible_height; + + do { + samples[0] = vld1q_u16(src); + samples[1] = + (max_luma_width >= 16) ? vld1q_u16(src + 8) : LastRowResult(samples[0]); + uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]); + if (block_width == 32) { + samples[2] = (max_luma_width >= 24) ? vld1q_u16(src + 16) + : LastRowResult(samples[1]); + samples[3] = (max_luma_width == 32) ? vld1q_u16(src + 24) + : LastRowResult(samples[2]); + inner_sum = vaddq_u16(samples[2], inner_sum); + inner_sum = vaddq_u16(samples[3], inner_sum); + } + sum = vpadalq_u16(sum, inner_sum); + src += src_stride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]); + if (block_width == 32) { + inner_sum = vaddq_u16(samples[2], inner_sum); + inner_sum = vaddq_u16(samples[3], inner_sum); + } + do { + sum = vpadalq_u16(sum, inner_sum); + } while (++y < block_height); + } + + // Here the left shift by 3 (to increase precision) is subtracted in right + // shift factor (block_width_log2 + block_height_log2 - 3). 
+ const uint32_t average_sum = RightShiftWithRounding( + SumVector(sum), block_width_log2 + block_height_log2 - 3); + const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum)); + + const auto* ssrc = static_cast<const int16_t*>(source); + int16x8_t ssamples_ext = vdupq_n_s16(0); + int16x8_t ssamples[4]; + luma_ptr = luma[0]; + y = visible_height; + do { + int idx = 0; + for (int x = 0; x < block_width; x += 8) { + if (max_luma_width > x) { + ssamples[idx] = vld1q_s16(&ssrc[x]); + ssamples[idx] = vshlq_n_s16(ssamples[idx], 3); + ssamples_ext = ssamples[idx]; + } else { + ssamples[idx] = LastRowResult(ssamples_ext); + } + vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages)); + } + ssrc += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + int idx = 0; + for (int x = 0; x < block_width; x += 8) { + vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages)); + } + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_width_log2, int block_height_log2> +void CflSubsampler444_WxH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_assert(block_width_log2 == 4 || block_width_log2 == 5, + "This function will only work for block_width 16 and 32."); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + + const int block_height = 1 << block_height_log2; + const int vert_inside = block_height <= max_luma_height; + if (vert_inside) { + CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, true>( + luma, max_luma_width, max_luma_height, source, stride); + } else { + CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, false>( + luma, max_luma_width, max_luma_height, source, stride); + } +} + +template <int 
block_height_log2> +void CflSubsampler420_4xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int /*max_luma_width*/, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = luma_height; + + uint32x4_t final_sum = vdupq_n_u32(0); + do { + const uint16x8_t samples_row0 = vld1q_u16(src); + src += src_stride; + const uint16x8_t samples_row1 = vld1q_u16(src); + src += src_stride; + const uint16x8_t luma_sum01 = vaddq_u16(samples_row0, samples_row1); + + const uint16x8_t samples_row2 = vld1q_u16(src); + src += src_stride; + const uint16x8_t samples_row3 = vld1q_u16(src); + src += src_stride; + const uint16x8_t luma_sum23 = vaddq_u16(samples_row2, samples_row3); + uint16x8_t sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr); + luma_ptr += kCflLumaBufferStride << 1; + + const uint16x8_t samples_row4 = vld1q_u16(src); + src += src_stride; + const uint16x8_t samples_row5 = vld1q_u16(src); + src += src_stride; + const uint16x8_t luma_sum45 = vaddq_u16(samples_row4, samples_row5); + + const uint16x8_t samples_row6 = vld1q_u16(src); + src += src_stride; + const uint16x8_t samples_row7 = vld1q_u16(src); + src += src_stride; + const uint16x8_t luma_sum67 = vaddq_u16(samples_row6, samples_row7); + sum = + vaddq_u16(sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr)); + luma_ptr += kCflLumaBufferStride << 1; + + final_sum = vpadalq_u16(final_sum, sum); + y -= 4; + } while (y != 0); + + const uint16x4_t final_fill = + vreinterpret_u16_s16(vld1_s16(luma_ptr - kCflLumaBufferStride)); + const uint32x4_t final_fill_to_sum = vmovl_u16(final_fill); + for (y = luma_height; y < block_height; ++y) { + vst1_s16(luma_ptr, 
vreinterpret_s16_u16(final_fill)); + luma_ptr += kCflLumaBufferStride; + final_sum = vaddq_u32(final_sum, final_fill_to_sum); + } + const uint32_t average_sum = RightShiftWithRounding( + SumVector(final_sum), block_height_log2 + 2 /*log2 of width 4*/); + const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum)); + luma_ptr = luma[0]; + y = block_height; + do { + const int16x4_t samples = vld1_s16(luma_ptr); + vst1_s16(luma_ptr, vsub_s16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template <int block_height_log2, int max_luma_width> +inline void CflSubsampler420Impl_8xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = luma_height; + + uint32x4_t final_sum = vdupq_n_u32(0); + do { + const uint16x8_t samples_row00 = vld1q_u16(src); + const uint16x8_t samples_row01 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row00); + src += src_stride; + const uint16x8_t samples_row10 = vld1q_u16(src); + const uint16x8_t samples_row11 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row10); + src += src_stride; + const uint16x8_t luma_sum00 = vaddq_u16(samples_row00, samples_row10); + const uint16x8_t luma_sum01 = vaddq_u16(samples_row01, samples_row11); + uint16x8_t sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr); + luma_ptr += kCflLumaBufferStride; + + const uint16x8_t samples_row20 = vld1q_u16(src); + const uint16x8_t samples_row21 = (max_luma_width == 16) + ? 
vld1q_u16(src + 8) + : LastRowSamples(samples_row20); + src += src_stride; + const uint16x8_t samples_row30 = vld1q_u16(src); + const uint16x8_t samples_row31 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row30); + src += src_stride; + const uint16x8_t luma_sum10 = vaddq_u16(samples_row20, samples_row30); + const uint16x8_t luma_sum11 = vaddq_u16(samples_row21, samples_row31); + sum = + vaddq_u16(sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const uint16x8_t samples_row40 = vld1q_u16(src); + const uint16x8_t samples_row41 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row40); + src += src_stride; + const uint16x8_t samples_row50 = vld1q_u16(src); + const uint16x8_t samples_row51 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row50); + src += src_stride; + const uint16x8_t luma_sum20 = vaddq_u16(samples_row40, samples_row50); + const uint16x8_t luma_sum21 = vaddq_u16(samples_row41, samples_row51); + sum = + vaddq_u16(sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const uint16x8_t samples_row60 = vld1q_u16(src); + const uint16x8_t samples_row61 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row60); + src += src_stride; + const uint16x8_t samples_row70 = vld1q_u16(src); + const uint16x8_t samples_row71 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row70); + src += src_stride; + const uint16x8_t luma_sum30 = vaddq_u16(samples_row60, samples_row70); + const uint16x8_t luma_sum31 = vaddq_u16(samples_row61, samples_row71); + sum = + vaddq_u16(sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + final_sum = vpadalq_u16(final_sum, sum); + y -= 4; + } while (y != 0); + + // Duplicate the final row downward to the end after max_luma_height. 
+ const uint16x8_t final_fill = + vreinterpretq_u16_s16(vld1q_s16(luma_ptr - kCflLumaBufferStride)); + const uint32x4_t final_fill_to_sum = + vaddl_u16(vget_low_u16(final_fill), vget_high_u16(final_fill)); + + for (y = luma_height; y < block_height; ++y) { + vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill)); + luma_ptr += kCflLumaBufferStride; + final_sum = vaddq_u32(final_sum, final_fill_to_sum); + } + + const uint32_t average_sum = RightShiftWithRounding( + SumVector(final_sum), block_height_log2 + 3 /*log2 of width 8*/); + const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum)); + luma_ptr = luma[0]; + y = block_height; + do { + const int16x8_t samples = vld1q_s16(luma_ptr); + vst1q_s16(luma_ptr, vsubq_s16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template <int block_height_log2> +void CflSubsampler420_8xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + if (max_luma_width == 8) { + CflSubsampler420Impl_8xH_NEON<block_height_log2, 8>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler420Impl_8xH_NEON<block_height_log2, 16>(luma, max_luma_height, + source, stride); + } +} + +template <int block_width_log2, int block_height_log2, int max_luma_width> +inline void CflSubsampler420Impl_WxH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + const int block_height = 1 << block_height_log2; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int16_t* luma_ptr = luma[0]; + // Begin first y section, covering width up to 32. 
+ int y = luma_height; + + uint16x8_t final_fill0, final_fill1; + uint32x4_t final_sum = vdupq_n_u32(0); + do { + const uint16_t* src_next = src + src_stride; + const uint16x8_t samples_row00 = vld1q_u16(src); + const uint16x8_t samples_row01 = (max_luma_width >= 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row00); + const uint16x8_t samples_row02 = (max_luma_width >= 24) + ? vld1q_u16(src + 16) + : LastRowSamples(samples_row01); + const uint16x8_t samples_row03 = (max_luma_width == 32) + ? vld1q_u16(src + 24) + : LastRowSamples(samples_row02); + const uint16x8_t samples_row10 = vld1q_u16(src_next); + const uint16x8_t samples_row11 = (max_luma_width >= 16) + ? vld1q_u16(src_next + 8) + : LastRowSamples(samples_row10); + const uint16x8_t samples_row12 = (max_luma_width >= 24) + ? vld1q_u16(src_next + 16) + : LastRowSamples(samples_row11); + const uint16x8_t samples_row13 = (max_luma_width == 32) + ? vld1q_u16(src_next + 24) + : LastRowSamples(samples_row12); + const uint16x8_t luma_sum0 = vaddq_u16(samples_row00, samples_row10); + const uint16x8_t luma_sum1 = vaddq_u16(samples_row01, samples_row11); + const uint16x8_t luma_sum2 = vaddq_u16(samples_row02, samples_row12); + const uint16x8_t luma_sum3 = vaddq_u16(samples_row03, samples_row13); + final_fill0 = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr); + final_fill1 = StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8); + const uint16x8_t sum = vaddq_u16(final_fill0, final_fill1); + + final_sum = vpadalq_u16(final_sum, sum); + + // Because max_luma_width is at most 32, any values beyond x=16 will + // necessarily be duplicated. + if (block_width_log2 == 5) { + const uint16x8_t wide_fill = LastRowResult(final_fill1); + final_sum = vpadalq_u16(final_sum, vshlq_n_u16(wide_fill, 1)); + } + src += src_stride << 1; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + // Begin second y section. 
+ y = luma_height; + if (y < block_height) { + uint32x4_t wide_fill; + if (block_width_log2 == 5) { + // There are 16 16-bit fill values per row, shifting by 2 accounts for + // the widening to 32-bit. (a << 2) = (a + a) << 1. + wide_fill = vshll_n_u16(vget_low_u16(LastRowResult(final_fill1)), 2); + } + const uint16x8_t final_inner_sum = vaddq_u16(final_fill0, final_fill1); + const uint32x4_t final_fill_to_sum = vaddl_u16( + vget_low_u16(final_inner_sum), vget_high_u16(final_inner_sum)); + + do { + vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill0)); + vst1q_s16(luma_ptr + 8, vreinterpretq_s16_u16(final_fill1)); + if (block_width_log2 == 5) { + final_sum = vaddq_u32(final_sum, wide_fill); + } + luma_ptr += kCflLumaBufferStride; + final_sum = vaddq_u32(final_sum, final_fill_to_sum); + } while (++y < block_height); + } // End second y section. + + const uint32_t average_sum = RightShiftWithRounding( + SumVector(final_sum), block_width_log2 + block_height_log2); + const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum)); + + luma_ptr = luma[0]; + y = block_height; + do { + const int16x8_t samples0 = vld1q_s16(luma_ptr); + vst1q_s16(luma_ptr, vsubq_s16(samples0, averages)); + const int16x8_t samples1 = vld1q_s16(luma_ptr + 8); + const int16x8_t final_row_result = vsubq_s16(samples1, averages); + vst1q_s16(luma_ptr + 8, final_row_result); + + if (block_width_log2 == 5) { + const int16x8_t wide_fill = LastRowResult(final_row_result); + vst1q_s16(luma_ptr + 16, wide_fill); + vst1q_s16(luma_ptr + 24, wide_fill); + } + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +//------------------------------------------------------------------------------ +// Choose subsampler based on max_luma_width +template <int block_width_log2, int block_height_log2> +void CflSubsampler420_WxH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + 
switch (max_luma_width) { + case 8: + CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 8>( + luma, max_luma_height, source, stride); + return; + case 16: + CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 16>( + luma, max_luma_height, source, stride); + return; + case 24: + CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 24>( + luma, max_luma_height, source, stride); + return; + default: + assert(max_luma_width == 32); + CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 32>( + luma, max_luma_height, source, stride); + return; + } +} + +//------------------------------------------------------------------------------ +// CflIntraPredictor + +// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive. +// |alpha| can be -16 to 16 (inclusive). +// Clip |dc + ((alpha * luma) >> 6))| to 0, (1 << bitdepth) - 1. +inline uint16x8_t Combine8(const int16x8_t luma, const int16x8_t alpha_abs, + const int16x8_t alpha_signed, const int16x8_t dc, + const uint16x8_t max_value) { + const int16x8_t luma_abs = vabsq_s16(luma); + const int16x8_t luma_alpha_sign = + vshrq_n_s16(veorq_s16(luma, alpha_signed), 15); + // (alpha * luma) >> 6 + const int16x8_t la_abs = vqrdmulhq_s16(luma_abs, alpha_abs); + // Convert back to signed values. + const int16x8_t la = + vsubq_s16(veorq_s16(la_abs, luma_alpha_sign), luma_alpha_sign); + const int16x8_t result = vaddq_s16(la, dc); + const int16x8_t zero = vdupq_n_s16(0); + // Clip. 
+ return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(result, zero)), max_value); +} + +template <int block_height, int bitdepth = 10> +inline void CflIntraPredictor4xN_NEON( + void* const dest, const ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dst_stride = stride >> 1; + const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1); + const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9); + const int16x8_t alpha_abs = vabsq_s16(alpha_signed); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; y += 2) { + const int16x4_t luma_row0 = vld1_s16(luma[y]); + const int16x4_t luma_row1 = vld1_s16(luma[y + 1]); + const int16x8_t combined_luma = vcombine_s16(luma_row0, luma_row1); + const uint16x8_t sum = + Combine8(combined_luma, alpha_abs, alpha_signed, dc, max_value); + vst1_u16(dst, vget_low_u16(sum)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(sum)); + dst += dst_stride; + } +} + +template <int block_height, int bitdepth = 10> +inline void CflIntraPredictor8xN_NEON( + void* const dest, const ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dst_stride = stride >> 1; + const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1); + const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9); + const int16x8_t alpha_abs = vabsq_s16(alpha_signed); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; ++y) { + const int16x8_t luma_row = vld1q_s16(luma[y]); + const uint16x8_t sum = + Combine8(luma_row, alpha_abs, alpha_signed, dc, max_value); + vst1q_u16(dst, sum); + dst += dst_stride; + } +} + +template <int block_height, int bitdepth = 10> +inline void CflIntraPredictor16xN_NEON( + void* const dest, const ptrdiff_t stride, + const int16_t 
luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dst_stride = stride >> 1; + const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1); + const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9); + const int16x8_t alpha_abs = vabsq_s16(alpha_signed); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; ++y) { + const int16x8_t luma_row_0 = vld1q_s16(luma[y]); + const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8); + const uint16x8_t sum_0 = + Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value); + const uint16x8_t sum_1 = + Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value); + vst1q_u16(dst, sum_0); + vst1q_u16(dst + 8, sum_1); + dst += dst_stride; + } +} + +template <int block_height, int bitdepth = 10> +inline void CflIntraPredictor32xN_NEON( + void* const dest, const ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dst_stride = stride >> 1; + const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1); + const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9); + const int16x8_t alpha_abs = vabsq_s16(alpha_signed); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; ++y) { + const int16x8_t luma_row_0 = vld1q_s16(luma[y]); + const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8); + const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16); + const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24); + const uint16x8_t sum_0 = + Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value); + const uint16x8_t sum_1 = + Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value); + const uint16x8_t sum_2 = + Combine8(luma_row_2, alpha_abs, alpha_signed, dc, max_value); + const uint16x8_t sum_3 = + Combine8(luma_row_3, alpha_abs, alpha_signed, dc, max_value); + vst1q_u16(dst, sum_0); + vst1q_u16(dst + 8, 
sum_1); + vst1q_u16(dst + 16, sum_2); + vst1q_u16(dst + 24, sum_3); + dst += dst_stride; + } +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = + CflSubsampler420_4xH_NEON<2>; + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = + CflSubsampler420_4xH_NEON<3>; + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler420_4xH_NEON<4>; + + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler420_8xH_NEON<2>; + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler420_8xH_NEON<3>; + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler420_8xH_NEON<4>; + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler420_8xH_NEON<5>; + + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<4, 2>; + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<4, 3>; + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<4, 4>; + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<4, 5>; + + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<5, 3>; + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<5, 4>; + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<5, 5>; + + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = + CflSubsampler444_4xH_NEON<2>; + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = + CflSubsampler444_4xH_NEON<3>; + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler444_4xH_NEON<4>; + + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + 
CflSubsampler444_8xH_NEON<2>; + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler444_8xH_NEON<3>; + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler444_8xH_NEON<4>; + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler444_8xH_NEON<5>; + + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<4, 2>; + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<4, 3>; + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<4, 4>; + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<4, 5>; + + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<5, 3>; + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<5, 4>; + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<5, 5>; + + dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>; + dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>; + + dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>; + dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>; + dsp->cfl_intra_predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>; + + dsp->cfl_intra_predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>; + dsp->cfl_intra_predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor16xN_NEON<16>; + dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor16xN_NEON<32>; + dsp->cfl_intra_predictors[kTransformSize32x8] = 
CflIntraPredictor32xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor32xN_NEON<16>; + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor32xN_NEON<32>; + // Max Cfl predictor size is 32x32. +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraPredCflInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/intrapred_cfl_neon.h b/src/dsp/arm/intrapred_cfl_neon.h new file mode 100644 index 0000000..b4f983a --- /dev/null +++ b/src/dsp/arm/intrapred_cfl_neon.h @@ -0,0 +1,179 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers, see the +// defines below for specifics. These functions are not thread-safe. 
+void IntraPredCflInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +// 4x4 +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 4x8 +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 4x16 +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x4 +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x8 +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x16 +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x32 +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x4 +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x8 +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x16 +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x32 +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x8 +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x16 +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x32 +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// ----------------------------------------------------------------------------- +// 10bpp + +// 4x4 +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 4x8 +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 4x16 +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x4 +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x8 +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x16 +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x32 +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x4 +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x8 +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x16 +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x32 +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x8 +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x16 +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x32 +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON + +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_ diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc index 805ba81..3f5edbd 100644 --- a/src/dsp/arm/intrapred_directional_neon.cc +++ b/src/dsp/arm/intrapred_directional_neon.cc @@ -12,18 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "src/dsp/intrapred.h" +#include "src/dsp/intrapred_directional.h" #include "src/utils/cpu.h" #if LIBGAV1_ENABLE_NEON #include <arm_neon.h> -#include <algorithm> // std::min +#include <algorithm> #include <cassert> #include <cstddef> #include <cstdint> -#include <cstring> // memset +#include <cstring> #include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" @@ -35,14 +35,14 @@ namespace dsp { namespace low_bitdepth { namespace { -// Blend two values based on a 32 bit weight. +// Blend two values based on weights that sum to 32. inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b, const uint8x8_t a_weight, const uint8x8_t b_weight) { const uint16x8_t a_product = vmull_u8(a, a_weight); const uint16x8_t b_product = vmull_u8(b, b_weight); - return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5); + return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5 /*log2(32)*/); } // For vertical operations the weights are one constant value. @@ -112,7 +112,7 @@ inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride, // 4 wide subsamples the output. 8 wide subsamples the input. if (width == 4) { const uint8x8_t left_values = vld1_u8(top + top_base_x); - const uint8x8_t right_values = RightShift<8>(left_values); + const uint8x8_t right_values = RightShiftVector<8>(left_values); const uint8x8_t value = WeightedBlend(left_values, right_values, shift); // If |upsampled| is true then extract every other value for output. @@ -910,12 +910,590 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void IntraPredDirectionalInit_NEON() { low_bitdepth::Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +// Blend two values based on weights that sum to 32. 
+inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b, + const int a_weight, const int b_weight) { + const uint16x4_t a_product = vmul_n_u16(a, a_weight); + const uint16x4_t sum = vmla_n_u16(a_product, b, b_weight); + + return vrshr_n_u16(sum, 5 /*log2(32)*/); +} + +// Blend two values based on weights that sum to 32. +inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, + const uint16_t a_weight, + const uint16_t b_weight) { + const uint16x8_t a_product = vmulq_n_u16(a, a_weight); + const uint16x8_t sum = vmlaq_n_u16(a_product, b, b_weight); + + return vrshrq_n_u16(sum, 5 /*log2(32)*/); +} + +// Each element of |dest| contains values associated with one weight value. +inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* const source, + const bool upsampled) { + if (upsampled) { + *dest = vld2_u16(source); + } else { + dest->val[0] = vld1_u16(source); + dest->val[1] = vld1_u16(source + 1); + } +} + +// Each element of |dest| contains values associated with one weight value. +inline void LoadEdgeVals(uint16x8x2_t* dest, const uint16_t* const source, + const bool upsampled) { + if (upsampled) { + *dest = vld2q_u16(source); + } else { + dest->val[0] = vld1q_u16(source); + dest->val[1] = vld1q_u16(source + 1); + } +} + +template <bool upsampled> +inline void DirectionalZone1_4xH(uint16_t* dst, const ptrdiff_t stride, + const int height, const uint16_t* const top, + const int xstep) { + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + const int max_base_x = (4 + height - 1) << upsample_shift; + const int16x4_t max_base = vdup_n_s16(max_base_x); + const uint16x4_t final_top_val = vdup_n_u16(top[max_base_x]); + const int16x4_t index_offset = {0, 1, 2, 3}; + + // All rows from |min_corner_only_y| down will simply use Memset. + // |max_base_x| is always greater than |height|, so clipping the denominator + // to 1 is enough to make the logic work. 
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1); + const int min_corner_only_y = std::min(max_base_x / xstep_units, height); + + int top_x = xstep; + int y = 0; + for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) { + const int top_base_x = top_x >> index_scale_bits; + + // To accommodate reuse of this function in Zone2, permit negative values + // for |xstep|. + const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const uint16_t shift_1 = 32 - shift_0; + + // Use signed values to compare |top_base_x| to |max_base_x|. + const int16x4_t base_x = vadd_s16(vdup_n_s16(top_base_x), index_offset); + const uint16x4_t max_base_mask = vclt_s16(base_x, max_base); + + uint16x4x2_t sampled_top_row; + LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled); + const uint16x4_t combined = WeightedBlend( + sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0); + + // If |upsampled| is true then extract every other value for output. + const uint16x4_t masked_result = + vbsl_u16(max_base_mask, combined, final_top_val); + + vst1_u16(dst, masked_result); + } + for (; y < height; ++y) { + Memset(dst, top[max_base_x], 4 /* width */); + dst += stride; + } +} + +// Process a multiple of 8 |width| by any |height|. Processes horizontally +// before vertically in the hopes of being a little more cache friendly. 
+template <bool upsampled> +inline void DirectionalZone1_WxH(uint16_t* dst, const ptrdiff_t stride, + const int width, const int height, + const uint16_t* const top, const int xstep) { + assert(width % 8 == 0); + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + const int max_base_index = (width + height - 1) << upsample_shift; + const int16x8_t max_base_x = vdupq_n_s16(max_base_index); + const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]); + const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7}; + + const int base_step = 1 << upsample_shift; + const int base_step8 = base_step << 3; + const int16x8_t block_step = vdupq_n_s16(base_step8); + + // All rows from |min_corner_only_y| down will simply use Memset. + // |max_base_x| is always greater than |height|, so clipping the denominator + // to 1 is enough to make the logic work. + const int xstep_units = std::max(xstep >> index_scale_bits, 1); + const int min_corner_only_y = std::min(max_base_index / xstep_units, height); + + int top_x = xstep; + int y = 0; + for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) { + int top_base_x = top_x >> index_scale_bits; + + // To accommodate reuse of this function in Zone2, permit negative values + // for |xstep|. + const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const uint16_t shift_1 = 32 - shift_0; + + // Use signed values to compare |top_base_x| to |max_base_x|. 
+ int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset); + + int x = 0; + do { + const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x); + + uint16x8x2_t sampled_top_row; + LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled); + const uint16x8_t combined = WeightedBlend( + sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0); + + const uint16x8_t masked_result = + vbslq_u16(max_base_mask, combined, final_top_val); + vst1q_u16(dst + x, masked_result); + + base_x = vaddq_s16(base_x, block_step); + top_base_x += base_step8; + x += 8; + } while (x < width); + } + for (int i = y; i < height; ++i) { + Memset(dst, top[max_base_index], width); + dst += stride; + } +} + +// Process a multiple of 8 |width| by any |height|. Processes horizontally +// before vertically in the hopes of being a little more cache friendly. +inline void DirectionalZone1_Large(uint16_t* dst, const ptrdiff_t stride, + const int width, const int height, + const uint16_t* const top, const int xstep, + const bool upsampled) { + assert(width % 8 == 0); + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + const int max_base_index = (width + height - 1) << upsample_shift; + const int16x8_t max_base_x = vdupq_n_s16(max_base_index); + const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]); + const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7}; + + const int base_step = 1 << upsample_shift; + const int base_step8 = base_step << 3; + const int16x8_t block_step = vdupq_n_s16(base_step8); + + // All rows from |min_corner_only_y| down will simply use Memset. + // |max_base_x| is always greater than |height|, so clipping the denominator + // to 1 is enough to make the logic work. + const int xstep_units = std::max(xstep >> index_scale_bits, 1); + const int min_corner_only_y = std::min(max_base_index / xstep_units, height); + + // Rows up to this y-value can be computed without checking for bounds. 
+ const int max_no_corner_y = std::min( + ((max_base_index - (base_step * width)) << index_scale_bits) / xstep, + height); + // No need to check for exceeding |max_base_x| in the first loop. + int y = 0; + int top_x = xstep; + for (; y < max_no_corner_y; ++y, dst += stride, top_x += xstep) { + int top_base_x = top_x >> index_scale_bits; + // To accommodate reuse of this function in Zone2, permit negative values + // for |xstep|. + const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const uint16_t shift_1 = 32 - shift_0; + + int x = 0; + do { + uint16x8x2_t sampled_top_row; + LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled); + const uint16x8_t combined = WeightedBlend( + sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0); + + vst1q_u16(dst + x, combined); + + top_base_x += base_step8; + x += 8; + } while (x < width); + } + + for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) { + int top_base_x = top_x >> index_scale_bits; + + // To accommodate reuse of this function in Zone2, permit negative values + // for |xstep|. + const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const uint16_t shift_1 = 32 - shift_0; + + // Use signed values to compare |top_base_x| to |max_base_x|. 
+ int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset); + + int x = 0; + const int min_corner_only_x = + std::min(width, ((max_base_index - top_base_x) >> upsample_shift) + 7) & + ~7; + for (; x < min_corner_only_x; x += 8, top_base_x += base_step8, + base_x = vaddq_s16(base_x, block_step)) { + const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x); + + uint16x8x2_t sampled_top_row; + LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled); + const uint16x8_t combined = WeightedBlend( + sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0); + + const uint16x8_t masked_result = + vbslq_u16(max_base_mask, combined, final_top_val); + vst1q_u16(dst + x, masked_result); + } + // Corner-only section of the row. + Memset(dst + x, top[max_base_index], width - x); + } + for (; y < height; ++y) { + Memset(dst, top[max_base_index], width); + dst += stride; + } +} + +void DirectionalIntraPredictorZone1_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const int width, const int height, + const int xstep, + const bool upsampled_top) { + const uint16_t* const top = static_cast<const uint16_t*>(top_row); + uint16_t* dst = static_cast<uint16_t*>(dest); + stride /= sizeof(top[0]); + + assert(xstep > 0); + + if (xstep == 64) { + assert(!upsampled_top); + const uint16_t* top_ptr = top + 1; + const int width_bytes = width * sizeof(top[0]); + int y = height; + do { + memcpy(dst, top_ptr, width_bytes); + memcpy(dst + stride, top_ptr + 1, width_bytes); + memcpy(dst + 2 * stride, top_ptr + 2, width_bytes); + memcpy(dst + 3 * stride, top_ptr + 3, width_bytes); + dst += 4 * stride; + top_ptr += 4; + y -= 4; + } while (y != 0); + } else { + if (width == 4) { + if (upsampled_top) { + DirectionalZone1_4xH<true>(dst, stride, height, top, xstep); + } else { + DirectionalZone1_4xH<false>(dst, stride, height, top, xstep); + } + } else if (width >= 32) { + if (upsampled_top) { + DirectionalZone1_Large(dst, stride, width, height, top, 
xstep, true); + } else { + DirectionalZone1_Large(dst, stride, width, height, top, xstep, false); + } + } else if (upsampled_top) { + DirectionalZone1_WxH<true>(dst, stride, width, height, top, xstep); + } else { + DirectionalZone1_WxH<false>(dst, stride, width, height, top, xstep); + } + } +} + +// ----------------------------------------------------------------------------- +// Zone 3 +// This can be considered "the transpose of Zone 1." In Zone 1, the fractional +// step applies when moving vertically in the destination block, connected to +// the change in |y|, whereas in this mode, the step applies when moving +// horizontally, connected to the change in |x|. This makes vectorization very +// complicated in row-order, because a given vector may need source pixels that +// span 16 or 32 pixels in steep angles, requiring multiple expensive table +// lookups and checked loads. Rather than work in row order, it is simpler to +// compute |dest| in column order, and then store the transposed results. + +// Compute 4x4 sub-blocks. +// Example of computed sub-blocks of a 4x8 block before and after transpose: +// 00 10 20 30 00 01 02 03 +// 01 11 21 31 10 11 12 13 +// 02 12 22 32 20 21 22 23 +// 03 13 23 33 30 31 32 33 +// ----------- --> ----------- +// 40 50 60 70 40 41 42 43 +// 41 51 61 71 50 51 52 53 +// 42 52 62 72 60 61 62 63 +// 43 53 63 73 70 71 72 73 +template <bool upsampled> +inline void DirectionalZone3_4x4(uint8_t* dst, const ptrdiff_t stride, + const uint16_t* const left, const int ystep, + const int base_left_y = 0) { + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + // Compute one column at a time, then transpose for storage. 
+ uint16x4_t result[4]; + + int left_y = base_left_y + ystep; + int left_offset = left_y >> index_scale_bits; + int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + int shift_1 = 32 - shift_0; + uint16x4x2_t sampled_left_col; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + Transpose4x4(result); + Store4(dst, result[0]); + dst += stride; + Store4(dst, result[1]); + dst += stride; + Store4(dst, result[2]); + dst += stride; + Store4(dst, result[3]); +} + +template <bool upsampled> +inline void DirectionalZone3_4xH(uint8_t* dest, const ptrdiff_t stride, + const int height, const uint16_t* const left, + const int ystep) { + const int upsample_shift = static_cast<int>(upsampled); + int y = 0; + do { + DirectionalZone3_4x4<upsampled>(dest, stride, left + (y << upsample_shift), + ystep); + dest += 4 * stride; + y += 4; + } while (y < height); +} + +template <bool upsampled> +inline void 
DirectionalZone3_Wx4(uint8_t* dest, const ptrdiff_t stride, + const int width, const uint16_t* const left, + const int ystep) { + int x = 0; + int base_left_y = 0; + do { + // TODO(petersonab): Establish 8x4 transpose to reserve this function for + // 8x4 and 16x4. + DirectionalZone3_4x4<upsampled>(dest + 2 * x, stride, left, ystep, + base_left_y); + base_left_y += 4 * ystep; + x += 4; + } while (x < width); +} + +template <bool upsampled> +inline void DirectionalZone3_8x8(uint8_t* dest, const ptrdiff_t stride, + const uint16_t* const left, const int ystep, + const int base_left_y = 0) { + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + // Compute one column at a time, then transpose for storage. + uint16x8_t result[8]; + + int left_y = base_left_y + ystep; + uint16x8x2_t sampled_left_col; + int left_offset = left_y >> index_scale_bits; + int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + int shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, 
&left[left_offset], upsampled); + result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[4] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[5] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[6] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[7] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + Transpose8x8(result); + Store8(dest, result[0]); + dest += stride; + Store8(dest, result[1]); + dest += stride; + Store8(dest, result[2]); + dest += stride; + Store8(dest, result[3]); + dest += stride; + Store8(dest, result[4]); + dest += stride; + Store8(dest, result[5]); + dest += stride; + Store8(dest, result[6]); + dest += stride; + Store8(dest, result[7]); +} + +template <bool upsampled> +inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride, + const int width, const int height, + const uint16_t* const left, const int ystep) { + const int upsample_shift = 
static_cast<int>(upsampled); + // Zone3 never runs out of left_column values. + assert((width + height - 1) << upsample_shift > // max_base_y + ((ystep * width) >> (6 - upsample_shift)) + + (/* base_step */ 1 << upsample_shift) * + (height - 1)); // left_base_y + int y = 0; + do { + int x = 0; + uint8_t* dst_x = dest + y * stride; + do { + const int base_left_y = ystep * x; + DirectionalZone3_8x8<upsampled>( + dst_x, stride, left + (y << upsample_shift), ystep, base_left_y); + dst_x += 8 * sizeof(uint16_t); + x += 8; + } while (x < width); + y += 8; + } while (y < height); +} + +void DirectionalIntraPredictorZone3_NEON(void* const dest, + const ptrdiff_t stride, + const void* const left_column, + const int width, const int height, + const int ystep, + const bool upsampled_left) { + const uint16_t* const left = static_cast<const uint16_t*>(left_column); + uint8_t* dst = static_cast<uint8_t*>(dest); + + if (ystep == 64) { + assert(!upsampled_left); + const int width_bytes = width * sizeof(left[0]); + int y = height; + do { + const uint16_t* left_ptr = left + 1; + memcpy(dst, left_ptr, width_bytes); + memcpy(dst + stride, left_ptr + 1, width_bytes); + memcpy(dst + 2 * stride, left_ptr + 2, width_bytes); + memcpy(dst + 3 * stride, left_ptr + 3, width_bytes); + dst += 4 * stride; + left_ptr += 4; + y -= 4; + } while (y != 0); + return; + } + if (width == 4) { + if (upsampled_left) { + DirectionalZone3_4xH<true>(dst, stride, height, left, ystep); + } else { + DirectionalZone3_4xH<false>(dst, stride, height, left, ystep); + } + } else if (height == 4) { + if (upsampled_left) { + DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep); + } else { + DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep); + } + } else { + if (upsampled_left) { + // |upsampled_left| can only be true if |width| + |height| <= 16, + // therefore this is 8x8. 
+ DirectionalZone3_8x8<true>(dst, stride, left, ystep); + } else { + DirectionalZone3_WxH<false>(dst, stride, width, height, left, ystep); + } + } +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON; + dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraPredDirectionalInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/intrapred_directional_neon.h b/src/dsp/arm/intrapred_directional_neon.h new file mode 100644 index 0000000..f7d6235 --- /dev/null +++ b/src/dsp/arm/intrapred_directional_neon.h @@ -0,0 +1,56 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::directional_intra_predictor_zone*, see the defines below for +// specifics. 
These functions are not thread-safe. +void IntraPredDirectionalInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON +#endif + +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON +#endif + +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON +#endif + +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 +#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON +#endif + +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 +#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON +#endif + +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_ diff --git a/src/dsp/arm/intrapred_filter_intra_neon.cc b/src/dsp/arm/intrapred_filter_neon.cc index 411708e..bd9f61d 100644 --- a/src/dsp/arm/intrapred_filter_intra_neon.cc +++ b/src/dsp/arm/intrapred_filter_neon.cc @@ -1,4 +1,4 @@ -// Copyright 2019 The libgav1 Authors +// Copyright 2021 The libgav1 Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "src/dsp/intrapred.h" +#include "src/dsp/intrapred_filter.h" #include "src/utils/cpu.h" #if LIBGAV1_ENABLE_NEON @@ -160,16 +160,16 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void IntraPredFilterIntraInit_NEON() { low_bitdepth::Init8bpp(); } +void IntraPredFilterInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { -void IntraPredFilterIntraInit_NEON() {} +void IntraPredFilterInit_NEON() {} } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/arm/intrapred_filter_neon.h b/src/dsp/arm/intrapred_filter_neon.h new file mode 100644 index 0000000..283c1b1 --- /dev/null +++ b/src/dsp/arm/intrapred_filter_neon.h @@ -0,0 +1,37 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::filter_intra_predictor, see the defines below for specifics. +// These functions are not thread-safe. 
+void IntraPredFilterInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_ diff --git a/src/dsp/arm/intrapred_neon.cc b/src/dsp/arm/intrapred_neon.cc index c967d82..c143648 100644 --- a/src/dsp/arm/intrapred_neon.cc +++ b/src/dsp/arm/intrapred_neon.cc @@ -26,6 +26,7 @@ #include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" +#include "src/utils/constants.h" namespace libgav1 { namespace dsp { @@ -964,6 +965,200 @@ struct DcDefs { using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>; }; +// IntraPredFuncs_NEON::Horizontal -- duplicate left column across all rows + +template <int block_height> +void Horizontal4xH_NEON(void* const dest, ptrdiff_t stride, + const void* /*top_row*/, + const void* const left_column) { + const auto* const left = static_cast<const uint16_t*>(left_column); + auto* dst = static_cast<uint8_t*>(dest); + int y = 0; + do { + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + const uint16x4_t row = vld1_dup_u16(left + y); + vst1_u16(dst16, row); + dst += stride; + } while (++y < block_height); +} + +template <int block_height> +void Horizontal8xH_NEON(void* const dest, ptrdiff_t stride, + const void* /*top_row*/, + const void* const left_column) { + const auto* const left = static_cast<const uint16_t*>(left_column); + auto* dst = static_cast<uint8_t*>(dest); + int y = 0; + do { + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + const uint16x8_t row = vld1q_dup_u16(left + y); + vst1q_u16(dst16, row); + dst += stride; + } while (++y < block_height); +} + +template <int block_height> +void Horizontal16xH_NEON(void* const dest, ptrdiff_t stride, + const void* /*top_row*/, + const void* const left_column) { + const auto* const left = static_cast<const uint16_t*>(left_column); + auto* dst = 
static_cast<uint8_t*>(dest); + int y = 0; + do { + const uint16x8_t row0 = vld1q_dup_u16(left + y); + const uint16x8_t row1 = vld1q_dup_u16(left + y + 1); + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + vst1q_u16(dst16, row0); + vst1q_u16(dst16 + 8, row0); + dst += stride; + dst16 = reinterpret_cast<uint16_t*>(dst); + vst1q_u16(dst16, row1); + vst1q_u16(dst16 + 8, row1); + dst += stride; + y += 2; + } while (y < block_height); +} + +template <int block_height> +void Horizontal32xH_NEON(void* const dest, ptrdiff_t stride, + const void* /*top_row*/, + const void* const left_column) { + const auto* const left = static_cast<const uint16_t*>(left_column); + auto* dst = static_cast<uint8_t*>(dest); + int y = 0; + do { + const uint16x8_t row0 = vld1q_dup_u16(left + y); + const uint16x8_t row1 = vld1q_dup_u16(left + y + 1); + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + vst1q_u16(dst16, row0); + vst1q_u16(dst16 + 8, row0); + vst1q_u16(dst16 + 16, row0); + vst1q_u16(dst16 + 24, row0); + dst += stride; + dst16 = reinterpret_cast<uint16_t*>(dst); + vst1q_u16(dst16, row1); + vst1q_u16(dst16 + 8, row1); + vst1q_u16(dst16 + 16, row1); + vst1q_u16(dst16 + 24, row1); + dst += stride; + y += 2; + } while (y < block_height); +} + +// IntraPredFuncs_NEON::Vertical -- copy top row to all rows + +template <int block_height> +void Vertical4xH_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast<const uint8_t*>(top_row); + auto* dst = static_cast<uint8_t*>(dest); + const uint8x8_t row = vld1_u8(top); + int y = block_height; + do { + vst1_u8(dst, row); + dst += stride; + } while (--y != 0); +} + +template <int block_height> +void Vertical8xH_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast<const uint8_t*>(top_row); + auto* dst = static_cast<uint8_t*>(dest); + const uint8x16_t row = 
vld1q_u8(top); + int y = block_height; + do { + vst1q_u8(dst, row); + dst += stride; + } while (--y != 0); +} + +template <int block_height> +void Vertical16xH_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast<const uint8_t*>(top_row); + auto* dst = static_cast<uint8_t*>(dest); + const uint8x16_t row0 = vld1q_u8(top); + const uint8x16_t row1 = vld1q_u8(top + 16); + int y = block_height; + do { + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + dst += stride; + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + dst += stride; + y -= 2; + } while (y != 0); +} + +template <int block_height> +void Vertical32xH_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast<const uint8_t*>(top_row); + auto* dst = static_cast<uint8_t*>(dest); + const uint8x16_t row0 = vld1q_u8(top); + const uint8x16_t row1 = vld1q_u8(top + 16); + const uint8x16_t row2 = vld1q_u8(top + 32); + const uint8x16_t row3 = vld1q_u8(top + 48); + int y = block_height; + do { + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + vst1q_u8(dst + 32, row2); + vst1q_u8(dst + 48, row3); + dst += stride; + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + vst1q_u8(dst + 32, row2); + vst1q_u8(dst + 48, row3); + dst += stride; + y -= 2; + } while (y != 0); +} + +template <int block_height> +void Vertical64xH_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast<const uint8_t*>(top_row); + auto* dst = static_cast<uint8_t*>(dest); + const uint8x16_t row0 = vld1q_u8(top); + const uint8x16_t row1 = vld1q_u8(top + 16); + const uint8x16_t row2 = vld1q_u8(top + 32); + const uint8x16_t row3 = vld1q_u8(top + 48); + const uint8x16_t row4 = vld1q_u8(top + 64); + const uint8x16_t row5 = vld1q_u8(top + 80); + const uint8x16_t row6 = vld1q_u8(top + 
96); + const uint8x16_t row7 = vld1q_u8(top + 112); + int y = block_height; + do { + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + vst1q_u8(dst + 32, row2); + vst1q_u8(dst + 48, row3); + vst1q_u8(dst + 64, row4); + vst1q_u8(dst + 80, row5); + vst1q_u8(dst + 96, row6); + vst1q_u8(dst + 112, row7); + dst += stride; + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + vst1q_u8(dst + 32, row2); + vst1q_u8(dst + 48, row3); + vst1q_u8(dst + 64, row4); + vst1q_u8(dst + 80, row5); + vst1q_u8(dst + 96, row6); + vst1q_u8(dst + 112, row7); + dst += stride; + y -= 2; + } while (y != 0); +} + void Init10bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); assert(dsp != nullptr); @@ -973,6 +1168,8 @@ void Init10bpp() { DcDefs::_4x4::DcLeft; dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = DcDefs::_4x4::Dc; + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] = + Vertical4xH_NEON<4>; // 4x8 dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] = @@ -981,6 +1178,10 @@ void Init10bpp() { DcDefs::_4x8::DcLeft; dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] = DcDefs::_4x8::Dc; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] = + Horizontal4xH_NEON<8>; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] = + Vertical4xH_NEON<8>; // 4x16 dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] = @@ -989,6 +1190,10 @@ void Init10bpp() { DcDefs::_4x16::DcLeft; dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] = DcDefs::_4x16::Dc; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] = + Horizontal4xH_NEON<16>; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] = + Vertical4xH_NEON<16>; // 8x4 dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] = @@ -997,6 +1202,8 @@ void Init10bpp() { DcDefs::_8x4::DcLeft; dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] = DcDefs::_8x4::Dc; + 
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] = + Vertical8xH_NEON<4>; // 8x8 dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] = @@ -1005,6 +1212,10 @@ void Init10bpp() { DcDefs::_8x8::DcLeft; dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] = DcDefs::_8x8::Dc; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] = + Horizontal8xH_NEON<8>; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] = + Vertical8xH_NEON<8>; // 8x16 dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] = @@ -1013,6 +1224,8 @@ void Init10bpp() { DcDefs::_8x16::DcLeft; dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] = DcDefs::_8x16::Dc; + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] = + Vertical8xH_NEON<16>; // 8x32 dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] = @@ -1021,6 +1234,10 @@ void Init10bpp() { DcDefs::_8x32::DcLeft; dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] = DcDefs::_8x32::Dc; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] = + Horizontal8xH_NEON<32>; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] = + Vertical8xH_NEON<32>; // 16x4 dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] = @@ -1029,6 +1246,8 @@ void Init10bpp() { DcDefs::_16x4::DcLeft; dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] = DcDefs::_16x4::Dc; + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] = + Vertical16xH_NEON<4>; // 16x8 dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] = @@ -1037,6 +1256,10 @@ void Init10bpp() { DcDefs::_16x8::DcLeft; dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] = DcDefs::_16x8::Dc; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] = + Horizontal16xH_NEON<8>; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] = + Vertical16xH_NEON<8>; // 16x16 
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] = @@ -1045,6 +1268,8 @@ void Init10bpp() { DcDefs::_16x16::DcLeft; dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] = DcDefs::_16x16::Dc; + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] = + Vertical16xH_NEON<16>; // 16x32 dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] = @@ -1053,6 +1278,8 @@ void Init10bpp() { DcDefs::_16x32::DcLeft; dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] = DcDefs::_16x32::Dc; + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] = + Vertical16xH_NEON<32>; // 16x64 dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] = @@ -1061,6 +1288,8 @@ void Init10bpp() { DcDefs::_16x64::DcLeft; dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] = DcDefs::_16x64::Dc; + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] = + Vertical16xH_NEON<64>; // 32x8 dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] = @@ -1069,6 +1298,8 @@ void Init10bpp() { DcDefs::_32x8::DcLeft; dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] = DcDefs::_32x8::Dc; + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] = + Vertical32xH_NEON<8>; // 32x16 dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] = @@ -1077,6 +1308,8 @@ void Init10bpp() { DcDefs::_32x16::DcLeft; dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] = DcDefs::_32x16::Dc; + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] = + Vertical32xH_NEON<16>; // 32x32 dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] = @@ -1085,6 +1318,8 @@ void Init10bpp() { DcDefs::_32x32::DcLeft; dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] = DcDefs::_32x32::Dc; + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] = + Vertical32xH_NEON<32>; // 32x64 dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] = @@ 
-1093,6 +1328,10 @@ void Init10bpp() { DcDefs::_32x64::DcLeft; dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] = DcDefs::_32x64::Dc; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] = + Horizontal32xH_NEON<64>; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] = + Vertical32xH_NEON<64>; // 64x16 dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] = @@ -1101,6 +1340,8 @@ void Init10bpp() { DcDefs::_64x16::DcLeft; dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] = DcDefs::_64x16::Dc; + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] = + Vertical64xH_NEON<16>; // 64x32 dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] = @@ -1109,6 +1350,8 @@ void Init10bpp() { DcDefs::_64x32::DcLeft; dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] = DcDefs::_64x32::Dc; + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] = + Vertical64xH_NEON<32>; // 64x64 dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] = @@ -1117,6 +1360,8 @@ void Init10bpp() { DcDefs::_64x64::DcLeft; dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] = DcDefs::_64x64::Dc; + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] = + Vertical64xH_NEON<64>; } } // namespace @@ -1133,7 +1378,7 @@ void IntraPredInit_NEON() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/intrapred_neon.h b/src/dsp/arm/intrapred_neon.h index 16f858c..b27f29f 100644 --- a/src/dsp/arm/intrapred_neon.h +++ b/src/dsp/arm/intrapred_neon.h @@ -23,396 +23,282 @@ namespace libgav1 { namespace dsp { -// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*, -// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and -// Dsp::filter_intra_predictor, see the defines below for specifics. These -// functions are not thread-safe. 
-void IntraPredCflInit_NEON(); -void IntraPredDirectionalInit_NEON(); -void IntraPredFilterIntraInit_NEON(); +// Initializes Dsp::intra_predictors. +// See the defines below for specifics. These functions are not thread-safe. void IntraPredInit_NEON(); -void IntraPredSmoothInit_NEON(); } // namespace dsp } // namespace libgav1 #if LIBGAV1_ENABLE_NEON -// 8 bit -#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON - // 4x4 #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON // 4x8 #define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define 
LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON // 4x16 #define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON // 8x4 #define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON // 8x8 #define 
LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON // 8x16 #define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON // 8x32 #define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define 
LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON // 16x4 #define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON // 16x8 #define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON -#define 
LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON // 16x16 #define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON // 16x32 #define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON // 16x64 #define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define 
LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON // 32x8 #define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON // 32x16 #define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 
LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON // 32x32 #define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON // 32x64 #define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON // 64x16 #define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define 
LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON // 64x32 #define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON // 64x64 #define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON // 10 bit // 4x4 #define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 4x8 #define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 4x16 #define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 8x4 #define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 8x8 #define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 8x16 #define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 8x32 #define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal \ + 
LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 16x4 #define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 16x8 #define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 16x16 #define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 16x32 #define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 16x64 #define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 32x8 #define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define 
LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 32x16 #define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 32x32 #define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 32x64 #define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 64x16 #define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 64x32 #define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 64x64 #define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorVertical \ + LIBGAV1_CPU_NEON #endif // LIBGAV1_ENABLE_NEON #endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_ diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc index abc93e8..c33f333 100644 --- a/src/dsp/arm/intrapred_smooth_neon.cc +++ b/src/dsp/arm/intrapred_smooth_neon.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "src/dsp/intrapred.h" +#include "src/dsp/intrapred_smooth.h" #include "src/utils/cpu.h" #if LIBGAV1_ENABLE_NEON @@ -26,6 +26,7 @@ #include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" +#include "src/utils/constants.h" namespace libgav1 { namespace dsp { @@ -605,7 +606,7 @@ void IntraPredSmoothInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/intrapred_smooth_neon.h b/src/dsp/arm/intrapred_smooth_neon.h new file mode 100644 index 0000000..edd01be --- /dev/null +++ b/src/dsp/arm/intrapred_smooth_neon.h @@ -0,0 +1,149 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*]. +// This function is not thread-safe. +void IntraPredSmoothInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth 
LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \ + 
LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_ diff --git a/src/dsp/arm/inverse_transform_10bit_neon.cc b/src/dsp/arm/inverse_transform_10bit_neon.cc new file mode 100644 index 0000000..ff184a1 --- /dev/null +++ b/src/dsp/arm/inverse_transform_10bit_neon.cc @@ -0,0 +1,2543 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/inverse_transform.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10 + +#include <arm_neon.h> + +#include <algorithm> +#include <cassert> +#include <cstdint> + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/array_2d.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// Include the constants and utility functions inside the anonymous namespace. 
#include "src/dsp/inverse_transform.inc"

//------------------------------------------------------------------------------

// Transposes a 4x4 tile of 32-bit lanes held in four q registers.
// in:                 out:
//   00 01 02 03        00 10 20 30
//   10 11 12 13   ->   01 11 21 31
//   20 21 22 23        02 12 22 32
//   30 31 32 33        03 13 23 33
LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int32x4_t in[4],
                                        int32x4_t out[4]) {
  // vtrnq_s32 interleaves pairs of lanes:
  //   00 10 02 12  top.val[0]
  //   01 11 03 13  top.val[1]
  //   20 30 22 32  bottom.val[0]
  //   21 31 23 33  bottom.val[1]
  const int32x4x2_t top = vtrnq_s32(in[0], in[1]);
  const int32x4x2_t bottom = vtrnq_s32(in[2], in[3]);
  // The vextq_s32 pairs rotate the 64-bit halves into column order.
  out[0] = vextq_s32(vextq_s32(top.val[0], top.val[0], 2), bottom.val[0], 2);
  out[1] = vextq_s32(vextq_s32(top.val[1], top.val[1], 2), bottom.val[1], 2);
  out[2] = vextq_s32(top.val[0], vextq_s32(bottom.val[0], bottom.val[0], 2), 2);
  out[3] = vextq_s32(top.val[1], vextq_s32(bottom.val[1], bottom.val[1], 2), 2);
}

//------------------------------------------------------------------------------

// Stores |store_count| vectors of 4 lanes to consecutive rows of |dst|,
// starting at column |idx|. |store_count| must be a multiple of 4.
template <int store_count>
LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* dst, int32_t stride, int32_t idx,
                                    const int32x4_t* const s) {
  assert(store_count % 4 == 0);
  for (int row = 0; row < store_count; ++row) {
    vst1q_s32(&dst[row * stride + idx], s[row]);
  }
}

// Loads |load_count| vectors of 4 lanes from consecutive rows of |src|,
// starting at column |idx|. |load_count| must be a multiple of 4.
template <int load_count>
LIBGAV1_ALWAYS_INLINE void LoadSrc(const int32_t* src, int32_t stride,
                                   int32_t idx, int32x4_t* x) {
  assert(load_count % 4 == 0);
  for (int row = 0; row < load_count; ++row) {
    x[row] = vld1q_s32(&src[row * stride + idx]);
  }
}

// Butterfly rotate 4 values.
// Butterfly rotation: [x; y] = [cos -sin; sin cos] * [a; b], with 12-bit
// fixed-point coefficients and rounded shift. |flip| swaps the two outputs.
LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int32x4_t* a, int32x4_t* b,
                                               const int angle,
                                               const bool flip) {
  const int32_t cos128 = Cos128(angle);
  const int32_t sin128 = Sin128(angle);
  const int32x4_t acc_x = vmulq_n_s32(*a, cos128);
  const int32x4_t acc_y = vmulq_n_s32(*a, sin128);
  // The max range for the input is 18 bits. The cos128/sin128 is 13 bits,
  // which leaves 1 bit for the add/subtract. For 10bpp, x/y will fit in a 32
  // bit lane.
  const int32x4_t x0 = vmlsq_n_s32(acc_x, *b, sin128);
  const int32x4_t y0 = vmlaq_n_s32(acc_y, *b, cos128);
  const int32x4_t x = vrshrq_n_s32(x0, 12);
  const int32x4_t y = vrshrq_n_s32(y0, 12);
  if (flip) {
    *a = y;
    *b = x;
  } else {
    *a = x;
    *b = y;
  }
}

// Butterfly rotation specialized for *a == 0: the products against *a drop
// out, leaving only the *b terms.
LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int32x4_t* a,
                                                         int32x4_t* b,
                                                         const int angle,
                                                         const bool flip) {
  const int32_t cos128 = Cos128(angle);
  const int32_t sin128 = Sin128(angle);
  assert(sin128 <= 0xfff);
  const int32x4_t x0 = vmulq_n_s32(*b, -sin128);
  const int32x4_t y0 = vmulq_n_s32(*b, cos128);
  const int32x4_t x = vrshrq_n_s32(x0, 12);
  const int32x4_t y = vrshrq_n_s32(y0, 12);
  if (flip) {
    *a = y;
    *b = x;
  } else {
    *a = x;
    *b = y;
  }
}

// Butterfly rotation specialized for *b == 0: the products against *b drop
// out, leaving only the *a terms.
LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int32x4_t* a,
                                                          int32x4_t* b,
                                                          const int angle,
                                                          const bool flip) {
  const int32_t cos128 = Cos128(angle);
  const int32_t sin128 = Sin128(angle);
  const int32x4_t x0 = vmulq_n_s32(*a, cos128);
  const int32x4_t y0 = vmulq_n_s32(*a, sin128);
  const int32x4_t x = vrshrq_n_s32(x0, 12);
  const int32x4_t y = vrshrq_n_s32(y0, 12);
  if (flip) {
    *a = y;
    *b = x;
  } else {
    *a = x;
    *b = y;
  }
}

// In-place Hadamard butterfly using saturating adds:
//   a' = a + b, b' = a - b (operands exchanged when |flip| is set).
LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
                                            bool flip) {
  int32x4_t x, y;
  if (flip) {
    y = vqaddq_s32(*b, *a);
    x = vqsubq_s32(*b, *a);
  } else {
    x = vqaddq_s32(*a, *b);
    y = vqsubq_s32(*a, *b);
  }
  *a = x;
  *b = y;
}

// Hadamard butterfly with the outputs clamped to [*min, *max]; used on
// intermediate stages to keep values within the stage's valid range.
LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
                                            bool flip, const int32x4_t* min,
                                            const int32x4_t* max) {
  int32x4_t x, y;
  if (flip) {
    y = vqaddq_s32(*b, *a);
    x = vqsubq_s32(*b, *a);
  } else {
    x = vqaddq_s32(*a, *b);
    y = vqsubq_s32(*a, *b);
  }
  *a = vmaxq_s32(vminq_s32(x, *max), *min);
  *b = vmaxq_s32(vminq_s32(y, *max), *min);
}

using ButterflyRotationFunc = void (*)(int32x4_t* a, int32x4_t* b, int angle,
                                       bool flip);

//------------------------------------------------------------------------------
// Discrete Cosine Transforms (DCT).

// Fast path for a row transform where only the DC coefficient is nonzero
// (|adjusted_tx_height| == 1). Broadcasts the transformed DC value across the
// first row and returns true; returns false when the fast path does not apply.
template <int width>
LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
                                     bool should_round, int row_shift) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  const int32x4_t v_src = vdupq_n_s32(dst[0]);
  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
  const int32x4_t v_src_round =
      vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
  // Select the rounded or raw source depending on |should_round|.
  const int32x4_t s0 = vbslq_s32(v_mask, v_src_round, v_src);
  const int32_t cos128 = Cos128(32);
  const int32x4_t xy = vqrdmulhq_n_s32(s0, cos128 << (31 - 12));
  // vqrshlq_s32 will shift right if shift value is negative.
  const int32x4_t xy_shifted = vqrshlq_s32(xy, vdupq_n_s32(-row_shift));
  // Clamp result to signed 16 bits.
  const int32x4_t result = vmovl_s16(vqmovn_s32(xy_shifted));
  if (width == 4) {
    vst1q_s32(dst, result);
  } else {
    for (int i = 0; i < width; i += 4) {
      vst1q_s32(dst, result);
      dst += 4;
    }
  }
  return true;
}

// Fast path for a column transform where only the first row is nonzero.
// Transforms the first row in place, then copies it to every other row.
template <int height>
LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
                                           int width) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  const int32_t cos128 = Cos128(32);

  // Calculate dc values for first row.
  if (width == 4) {
    const int32x4_t v_src = vld1q_s32(dst);
    const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
    vst1q_s32(dst, xy);
  } else {
    int i = 0;
    do {
      const int32x4_t v_src = vld1q_s32(&dst[i]);
      const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
      vst1q_s32(&dst[i], xy);
      i += 4;
    } while (i < width);
  }

  // Copy first row to the rest of the block.
  for (int y = 1; y < height; ++y) {
    memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
  }
  return true;
}

// Shared stages of the 4-point DCT. When |is_fast_butterfly| is set the
// rotations use the x-is-zero specializations; when |is_last_stage| is set
// the final Hadamard stage skips the intermediate-range clamp.
template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct4Stages(int32x4_t* s, const int32x4_t* min,
                                      const int32x4_t* max,
                                      const bool is_last_stage) {
  // stage 12.
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
    ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
  } else {
    butterfly_rotation(&s[0], &s[1], 32, true);
    butterfly_rotation(&s[2], &s[3], 48, false);
  }

  // stage 17.
  if (is_last_stage) {
    HadamardRotation(&s[0], &s[3], false);
    HadamardRotation(&s[1], &s[2], false);
  } else {
    HadamardRotation(&s[0], &s[3], false, min, max);
    HadamardRotation(&s[1], &s[2], false, min, max);
  }
}

// Process dct4 rows or columns, depending on the |is_row| flag.
template <ButterflyRotationFunc butterfly_rotation>
LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
                                     int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  // When |is_row| is true, set range to the row range, otherwise, set to the
  // column range.
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t s[4], x[4];

  LoadSrc<4>(dst, step, 0, x);
  if (is_row) {
    Transpose4x4(x, x);
  }

  // stage 1.
  // kBitReverseLookup 0, 2, 1, 3
  s[0] = x[0];
  s[1] = x[2];
  s[2] = x[1];
  s[3] = x[3];

  Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);

  if (is_row) {
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int i = 0; i < 4; ++i) {
      // Round-shift, then clamp to signed 16 bits via narrow/widen.
      s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
    }
    Transpose4x4(s, s);
  }
  StoreDst<4>(dst, step, 0, s);
}

// Stages unique to the 8-point DCT; operates on s[4]..s[7] before merging
// with the 4-point half in the final butterfly.
template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct8Stages(int32x4_t* s, const int32x4_t* min,
                                      const int32x4_t* max,
                                      const bool is_last_stage) {
  // stage 8.
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
    ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
  } else {
    butterfly_rotation(&s[4], &s[7], 56, false);
    butterfly_rotation(&s[5], &s[6], 24, false);
  }

  // stage 13.
  HadamardRotation(&s[4], &s[5], false, min, max);
  HadamardRotation(&s[6], &s[7], true, min, max);

  // stage 18.
  butterfly_rotation(&s[6], &s[5], 32, true);

  // stage 22.
  if (is_last_stage) {
    HadamardRotation(&s[0], &s[7], false);
    HadamardRotation(&s[1], &s[6], false);
    HadamardRotation(&s[2], &s[5], false);
    HadamardRotation(&s[3], &s[4], false);
  } else {
    HadamardRotation(&s[0], &s[7], false, min, max);
    HadamardRotation(&s[1], &s[6], false, min, max);
    HadamardRotation(&s[2], &s[5], false, min, max);
    HadamardRotation(&s[3], &s[4], false, min, max);
  }
}

// Process dct8 rows or columns, depending on the |is_row| flag.
template <ButterflyRotationFunc butterfly_rotation>
LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool is_row,
                                     int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t s[8], x[8];

  if (is_row) {
    // Rows are processed in 4x4 tiles, transposed so lanes hold columns.
    LoadSrc<4>(dst, step, 0, &x[0]);
    LoadSrc<4>(dst, step, 4, &x[4]);
    Transpose4x4(&x[0], &x[0]);
    Transpose4x4(&x[4], &x[4]);
  } else {
    LoadSrc<8>(dst, step, 0, &x[0]);
  }

  // stage 1.
  // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
  s[0] = x[0];
  s[1] = x[4];
  s[2] = x[2];
  s[3] = x[6];
  s[4] = x[1];
  s[5] = x[5];
  s[6] = x[3];
  s[7] = x[7];

  Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
  Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);

  if (is_row) {
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int i = 0; i < 8; ++i) {
      s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
    }
    Transpose4x4(&s[0], &s[0]);
    Transpose4x4(&s[4], &s[4]);
    StoreDst<4>(dst, step, 0, &s[0]);
    StoreDst<4>(dst, step, 4, &s[4]);
  } else {
    StoreDst<8>(dst, step, 0, &s[0]);
  }
}

// Stages unique to the 16-point DCT; operates on s[8]..s[15] before merging
// with the 8-point half in the final butterfly.
template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct16Stages(int32x4_t* s, const int32x4_t* min,
                                       const int32x4_t* max,
                                       const bool is_last_stage) {
  // stage 5.
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
    ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
    ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
    ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
  } else {
    butterfly_rotation(&s[8], &s[15], 60, false);
    butterfly_rotation(&s[9], &s[14], 28, false);
    butterfly_rotation(&s[10], &s[13], 44, false);
    butterfly_rotation(&s[11], &s[12], 12, false);
  }

  // stage 9.
  HadamardRotation(&s[8], &s[9], false, min, max);
  HadamardRotation(&s[10], &s[11], true, min, max);
  HadamardRotation(&s[12], &s[13], false, min, max);
  HadamardRotation(&s[14], &s[15], true, min, max);

  // stage 14.
  butterfly_rotation(&s[14], &s[9], 48, true);
  butterfly_rotation(&s[13], &s[10], 112, true);

  // stage 19.
  HadamardRotation(&s[8], &s[11], false, min, max);
  HadamardRotation(&s[9], &s[10], false, min, max);
  HadamardRotation(&s[12], &s[15], true, min, max);
  HadamardRotation(&s[13], &s[14], true, min, max);

  // stage 23.
  butterfly_rotation(&s[13], &s[10], 32, true);
  butterfly_rotation(&s[12], &s[11], 32, true);

  // stage 26.
  if (is_last_stage) {
    HadamardRotation(&s[0], &s[15], false);
    HadamardRotation(&s[1], &s[14], false);
    HadamardRotation(&s[2], &s[13], false);
    HadamardRotation(&s[3], &s[12], false);
    HadamardRotation(&s[4], &s[11], false);
    HadamardRotation(&s[5], &s[10], false);
    HadamardRotation(&s[6], &s[9], false);
    HadamardRotation(&s[7], &s[8], false);
  } else {
    HadamardRotation(&s[0], &s[15], false, min, max);
    HadamardRotation(&s[1], &s[14], false, min, max);
    HadamardRotation(&s[2], &s[13], false, min, max);
    HadamardRotation(&s[3], &s[12], false, min, max);
    HadamardRotation(&s[4], &s[11], false, min, max);
    HadamardRotation(&s[5], &s[10], false, min, max);
    HadamardRotation(&s[6], &s[9], false, min, max);
    HadamardRotation(&s[7], &s[8], false, min, max);
  }
}

// Process dct16 rows or columns, depending on the |is_row| flag.
template <ButterflyRotationFunc butterfly_rotation>
LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
                                      int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t s[16], x[16];

  if (is_row) {
    // Rows are processed as transposed 4x4 tiles.
    for (int idx = 0; idx < 16; idx += 8) {
      LoadSrc<4>(dst, step, idx, &x[idx]);
      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
      Transpose4x4(&x[idx], &x[idx]);
      Transpose4x4(&x[idx + 4], &x[idx + 4]);
    }
  } else {
    LoadSrc<16>(dst, step, 0, &x[0]);
  }

  // stage 1
  // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
  s[0] = x[0];
  s[1] = x[8];
  s[2] = x[4];
  s[3] = x[12];
  s[4] = x[2];
  s[5] = x[10];
  s[6] = x[6];
  s[7] = x[14];
  s[8] = x[1];
  s[9] = x[9];
  s[10] = x[5];
  s[11] = x[13];
  s[12] = x[3];
  s[13] = x[11];
  s[14] = x[7];
  s[15] = x[15];

  Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
  Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
  Dct16Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);

  if (is_row) {
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int i = 0; i < 16; ++i) {
      // Round-shift, then clamp to signed 16 bits via narrow/widen.
      s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
    }
    for (int idx = 0; idx < 16; idx += 8) {
      Transpose4x4(&s[idx], &s[idx]);
      Transpose4x4(&s[idx + 4], &s[idx + 4]);
      StoreDst<4>(dst, step, idx, &s[idx]);
      StoreDst<4>(dst, step, idx + 4, &s[idx + 4]);
    }
  } else {
    StoreDst<16>(dst, step, 0, &s[0]);
  }
}

// Stages unique to the 32-point DCT; operates on s[16]..s[31] before merging
// with the 16-point half in the final butterfly.
template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct32Stages(int32x4_t* s, const int32x4_t* min,
                                       const int32x4_t* max,
                                       const bool is_last_stage) {
  // stage 3
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
    ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
    ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
    ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
    ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
    ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
    ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
    ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
  } else {
    butterfly_rotation(&s[16], &s[31], 62, false);
    butterfly_rotation(&s[17], &s[30], 30, false);
    butterfly_rotation(&s[18], &s[29], 46, false);
    butterfly_rotation(&s[19], &s[28], 14, false);
    butterfly_rotation(&s[20], &s[27], 54, false);
    butterfly_rotation(&s[21], &s[26], 22, false);
    butterfly_rotation(&s[22], &s[25], 38, false);
    butterfly_rotation(&s[23], &s[24], 6, false);
  }

  // stage 6.
  HadamardRotation(&s[16], &s[17], false, min, max);
  HadamardRotation(&s[18], &s[19], true, min, max);
  HadamardRotation(&s[20], &s[21], false, min, max);
  HadamardRotation(&s[22], &s[23], true, min, max);
  HadamardRotation(&s[24], &s[25], false, min, max);
  HadamardRotation(&s[26], &s[27], true, min, max);
  HadamardRotation(&s[28], &s[29], false, min, max);
  HadamardRotation(&s[30], &s[31], true, min, max);

  // stage 10.
  butterfly_rotation(&s[30], &s[17], 24 + 32, true);
  butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
  butterfly_rotation(&s[26], &s[21], 24, true);
  butterfly_rotation(&s[25], &s[22], 24 + 64, true);

  // stage 15.
  HadamardRotation(&s[16], &s[19], false, min, max);
  HadamardRotation(&s[17], &s[18], false, min, max);
  HadamardRotation(&s[20], &s[23], true, min, max);
  HadamardRotation(&s[21], &s[22], true, min, max);
  HadamardRotation(&s[24], &s[27], false, min, max);
  HadamardRotation(&s[25], &s[26], false, min, max);
  HadamardRotation(&s[28], &s[31], true, min, max);
  HadamardRotation(&s[29], &s[30], true, min, max);

  // stage 20.
  butterfly_rotation(&s[29], &s[18], 48, true);
  butterfly_rotation(&s[28], &s[19], 48, true);
  butterfly_rotation(&s[27], &s[20], 48 + 64, true);
  butterfly_rotation(&s[26], &s[21], 48 + 64, true);

  // stage 24.
  HadamardRotation(&s[16], &s[23], false, min, max);
  HadamardRotation(&s[17], &s[22], false, min, max);
  HadamardRotation(&s[18], &s[21], false, min, max);
  HadamardRotation(&s[19], &s[20], false, min, max);
  HadamardRotation(&s[24], &s[31], true, min, max);
  HadamardRotation(&s[25], &s[30], true, min, max);
  HadamardRotation(&s[26], &s[29], true, min, max);
  HadamardRotation(&s[27], &s[28], true, min, max);

  // stage 27.
  butterfly_rotation(&s[27], &s[20], 32, true);
  butterfly_rotation(&s[26], &s[21], 32, true);
  butterfly_rotation(&s[25], &s[22], 32, true);
  butterfly_rotation(&s[24], &s[23], 32, true);

  // stage 29.
  if (is_last_stage) {
    HadamardRotation(&s[0], &s[31], false);
    HadamardRotation(&s[1], &s[30], false);
    HadamardRotation(&s[2], &s[29], false);
    HadamardRotation(&s[3], &s[28], false);
    HadamardRotation(&s[4], &s[27], false);
    HadamardRotation(&s[5], &s[26], false);
    HadamardRotation(&s[6], &s[25], false);
    HadamardRotation(&s[7], &s[24], false);
    HadamardRotation(&s[8], &s[23], false);
    HadamardRotation(&s[9], &s[22], false);
    HadamardRotation(&s[10], &s[21], false);
    HadamardRotation(&s[11], &s[20], false);
    HadamardRotation(&s[12], &s[19], false);
    HadamardRotation(&s[13], &s[18], false);
    HadamardRotation(&s[14], &s[17], false);
    HadamardRotation(&s[15], &s[16], false);
  } else {
    HadamardRotation(&s[0], &s[31], false, min, max);
    HadamardRotation(&s[1], &s[30], false, min, max);
    HadamardRotation(&s[2], &s[29], false, min, max);
    HadamardRotation(&s[3], &s[28], false, min, max);
    HadamardRotation(&s[4], &s[27], false, min, max);
    HadamardRotation(&s[5], &s[26], false, min, max);
    HadamardRotation(&s[6], &s[25], false, min, max);
    HadamardRotation(&s[7], &s[24], false, min, max);
    HadamardRotation(&s[8], &s[23], false, min, max);
    HadamardRotation(&s[9], &s[22], false, min, max);
    HadamardRotation(&s[10], &s[21], false, min, max);
    HadamardRotation(&s[11], &s[20], false, min, max);
    HadamardRotation(&s[12], &s[19], false, min, max);
    HadamardRotation(&s[13], &s[18], false, min, max);
    HadamardRotation(&s[14], &s[17], false, min, max);
    HadamardRotation(&s[15], &s[16], false, min, max);
  }
}

// Process dct32 rows or columns, depending on the |is_row| flag.
LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
                                      const bool is_row, int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t s[32], x[32];

  if (is_row) {
    // Rows are processed as transposed 4x4 tiles.
    for (int idx = 0; idx < 32; idx += 8) {
      LoadSrc<4>(dst, step, idx, &x[idx]);
      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
      Transpose4x4(&x[idx], &x[idx]);
      Transpose4x4(&x[idx + 4], &x[idx + 4]);
    }
  } else {
    LoadSrc<32>(dst, step, 0, &x[0]);
  }

  // stage 1
  // kBitReverseLookup
  // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
  s[0] = x[0];
  s[1] = x[16];
  s[2] = x[8];
  s[3] = x[24];
  s[4] = x[4];
  s[5] = x[20];
  s[6] = x[12];
  s[7] = x[28];
  s[8] = x[2];
  s[9] = x[18];
  s[10] = x[10];
  s[11] = x[26];
  s[12] = x[6];
  s[13] = x[22];
  s[14] = x[14];
  s[15] = x[30];

  // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
  s[16] = x[1];
  s[17] = x[17];
  s[18] = x[9];
  s[19] = x[25];
  s[20] = x[5];
  s[21] = x[21];
  s[22] = x[13];
  s[23] = x[29];
  s[24] = x[3];
  s[25] = x[19];
  s[26] = x[11];
  s[27] = x[27];
  s[28] = x[7];
  s[29] = x[23];
  s[30] = x[15];
  s[31] = x[31];

  Dct4Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
  Dct8Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
  Dct16Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
  Dct32Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/true);

  if (is_row) {
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int idx = 0; idx < 32; idx += 8) {
      int32x4_t output[8];
      Transpose4x4(&s[idx], &output[0]);
      Transpose4x4(&s[idx + 4], &output[4]);
      for (int i = 0; i < 8; ++i) {
        // Round-shift, then clamp to signed 16 bits via narrow/widen.
        output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
      }
      StoreDst<4>(dst, step, idx, &output[0]);
      StoreDst<4>(dst, step, idx + 4, &output[4]);
    }
  } else {
    StoreDst<32>(dst, step, 0, &s[0]);
  }
}

// Process dct64 rows or columns, depending on the |is_row| flag. Only the
// first 32 input values are loaded; the rest are known to be zero and are
// handled by the fast (x-is-zero) butterfly specializations.
void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t s[64], x[32];

  if (is_row) {
    // The last 32 values of every row are always zero if the |tx_width| is
    // 64.
    for (int idx = 0; idx < 32; idx += 8) {
      LoadSrc<4>(dst, step, idx, &x[idx]);
      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
      Transpose4x4(&x[idx], &x[idx]);
      Transpose4x4(&x[idx + 4], &x[idx + 4]);
    }
  } else {
    // The last 32 values of every column are always zero if the |tx_height| is
    // 64.
    LoadSrc<32>(dst, step, 0, &x[0]);
  }

  // stage 1
  // kBitReverseLookup
  // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
  // Odd-indexed s[] entries correspond to the zero upper half of the input
  // and are left uninitialized; the fast butterflies below never read them.
  s[0] = x[0];
  s[2] = x[16];
  s[4] = x[8];
  s[6] = x[24];
  s[8] = x[4];
  s[10] = x[20];
  s[12] = x[12];
  s[14] = x[28];

  // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
  s[16] = x[2];
  s[18] = x[18];
  s[20] = x[10];
  s[22] = x[26];
  s[24] = x[6];
  s[26] = x[22];
  s[28] = x[14];
  s[30] = x[30];

  // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
  s[32] = x[1];
  s[34] = x[17];
  s[36] = x[9];
  s[38] = x[25];
  s[40] = x[5];
  s[42] = x[21];
  s[44] = x[13];
  s[46] = x[29];

  // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
  s[48] = x[3];
  s[50] = x[19];
  s[52] = x[11];
  s[54] = x[27];
  s[56] = x[7];
  s[58] = x[23];
  s[60] = x[15];
  s[62] = x[31];

  Dct4Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
      s, &min, &max, /*is_last_stage=*/false);
  Dct8Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
      s, &min, &max, /*is_last_stage=*/false);
  Dct16Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
      s, &min, &max, /*is_last_stage=*/false);
  Dct32Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
      s, &min, &max, /*is_last_stage=*/false);

  //-- start dct 64 stages
  // stage 2.
  ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
  ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
  ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
  ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
  ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
  ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
  ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
  ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
  ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
  ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
  ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
  ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
  ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
  ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
  ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
  ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);

  // stage 4.
  HadamardRotation(&s[32], &s[33], false, &min, &max);
  HadamardRotation(&s[34], &s[35], true, &min, &max);
  HadamardRotation(&s[36], &s[37], false, &min, &max);
  HadamardRotation(&s[38], &s[39], true, &min, &max);
  HadamardRotation(&s[40], &s[41], false, &min, &max);
  HadamardRotation(&s[42], &s[43], true, &min, &max);
  HadamardRotation(&s[44], &s[45], false, &min, &max);
  HadamardRotation(&s[46], &s[47], true, &min, &max);
  HadamardRotation(&s[48], &s[49], false, &min, &max);
  HadamardRotation(&s[50], &s[51], true, &min, &max);
  HadamardRotation(&s[52], &s[53], false, &min, &max);
  HadamardRotation(&s[54], &s[55], true, &min, &max);
  HadamardRotation(&s[56], &s[57], false, &min, &max);
  HadamardRotation(&s[58], &s[59], true, &min, &max);
  HadamardRotation(&s[60], &s[61], false, &min, &max);
  HadamardRotation(&s[62], &s[63], true, &min, &max);

  // stage 7.
  ButterflyRotation_4(&s[62], &s[33], 60 - 0, true);
  ButterflyRotation_4(&s[61], &s[34], 60 - 0 + 64, true);
  ButterflyRotation_4(&s[58], &s[37], 60 - 32, true);
  ButterflyRotation_4(&s[57], &s[38], 60 - 32 + 64, true);
  ButterflyRotation_4(&s[54], &s[41], 60 - 16, true);
  ButterflyRotation_4(&s[53], &s[42], 60 - 16 + 64, true);
  ButterflyRotation_4(&s[50], &s[45], 60 - 48, true);
  ButterflyRotation_4(&s[49], &s[46], 60 - 48 + 64, true);

  // stage 11.
  HadamardRotation(&s[32], &s[35], false, &min, &max);
  HadamardRotation(&s[33], &s[34], false, &min, &max);
  HadamardRotation(&s[36], &s[39], true, &min, &max);
  HadamardRotation(&s[37], &s[38], true, &min, &max);
  HadamardRotation(&s[40], &s[43], false, &min, &max);
  HadamardRotation(&s[41], &s[42], false, &min, &max);
  HadamardRotation(&s[44], &s[47], true, &min, &max);
  HadamardRotation(&s[45], &s[46], true, &min, &max);
  HadamardRotation(&s[48], &s[51], false, &min, &max);
  HadamardRotation(&s[49], &s[50], false, &min, &max);
  HadamardRotation(&s[52], &s[55], true, &min, &max);
  HadamardRotation(&s[53], &s[54], true, &min, &max);
  HadamardRotation(&s[56], &s[59], false, &min, &max);
  HadamardRotation(&s[57], &s[58], false, &min, &max);
  HadamardRotation(&s[60], &s[63], true, &min, &max);
  HadamardRotation(&s[61], &s[62], true, &min, &max);

  // stage 16.
  ButterflyRotation_4(&s[61], &s[34], 56, true);
  ButterflyRotation_4(&s[60], &s[35], 56, true);
  ButterflyRotation_4(&s[59], &s[36], 56 + 64, true);
  ButterflyRotation_4(&s[58], &s[37], 56 + 64, true);
  ButterflyRotation_4(&s[53], &s[42], 56 - 32, true);
  ButterflyRotation_4(&s[52], &s[43], 56 - 32, true);
  ButterflyRotation_4(&s[51], &s[44], 56 - 32 + 64, true);
  ButterflyRotation_4(&s[50], &s[45], 56 - 32 + 64, true);

  // stage 21.
  HadamardRotation(&s[32], &s[39], false, &min, &max);
  HadamardRotation(&s[33], &s[38], false, &min, &max);
  HadamardRotation(&s[34], &s[37], false, &min, &max);
  HadamardRotation(&s[35], &s[36], false, &min, &max);
  HadamardRotation(&s[40], &s[47], true, &min, &max);
  HadamardRotation(&s[41], &s[46], true, &min, &max);
  HadamardRotation(&s[42], &s[45], true, &min, &max);
  HadamardRotation(&s[43], &s[44], true, &min, &max);
  HadamardRotation(&s[48], &s[55], false, &min, &max);
  HadamardRotation(&s[49], &s[54], false, &min, &max);
  HadamardRotation(&s[50], &s[53], false, &min, &max);
  HadamardRotation(&s[51], &s[52], false, &min, &max);
  HadamardRotation(&s[56], &s[63], true, &min, &max);
  HadamardRotation(&s[57], &s[62], true, &min, &max);
  HadamardRotation(&s[58], &s[61], true, &min, &max);
  HadamardRotation(&s[59], &s[60], true, &min, &max);

  // stage 25.
  ButterflyRotation_4(&s[59], &s[36], 48, true);
  ButterflyRotation_4(&s[58], &s[37], 48, true);
  ButterflyRotation_4(&s[57], &s[38], 48, true);
  ButterflyRotation_4(&s[56], &s[39], 48, true);
  ButterflyRotation_4(&s[55], &s[40], 112, true);
  ButterflyRotation_4(&s[54], &s[41], 112, true);
  ButterflyRotation_4(&s[53], &s[42], 112, true);
  ButterflyRotation_4(&s[52], &s[43], 112, true);

  // stage 28.
  HadamardRotation(&s[32], &s[47], false, &min, &max);
  HadamardRotation(&s[33], &s[46], false, &min, &max);
  HadamardRotation(&s[34], &s[45], false, &min, &max);
  HadamardRotation(&s[35], &s[44], false, &min, &max);
  HadamardRotation(&s[36], &s[43], false, &min, &max);
  HadamardRotation(&s[37], &s[42], false, &min, &max);
  HadamardRotation(&s[38], &s[41], false, &min, &max);
  HadamardRotation(&s[39], &s[40], false, &min, &max);
  HadamardRotation(&s[48], &s[63], true, &min, &max);
  HadamardRotation(&s[49], &s[62], true, &min, &max);
  HadamardRotation(&s[50], &s[61], true, &min, &max);
  HadamardRotation(&s[51], &s[60], true, &min, &max);
  HadamardRotation(&s[52], &s[59], true, &min, &max);
  HadamardRotation(&s[53], &s[58], true, &min, &max);
  HadamardRotation(&s[54], &s[57], true, &min, &max);
  HadamardRotation(&s[55], &s[56], true, &min, &max);

  // stage 30.
  ButterflyRotation_4(&s[55], &s[40], 32, true);
  ButterflyRotation_4(&s[54], &s[41], 32, true);
  ButterflyRotation_4(&s[53], &s[42], 32, true);
  ButterflyRotation_4(&s[52], &s[43], 32, true);
  ButterflyRotation_4(&s[51], &s[44], 32, true);
  ButterflyRotation_4(&s[50], &s[45], 32, true);
  ButterflyRotation_4(&s[49], &s[46], 32, true);
  ButterflyRotation_4(&s[48], &s[47], 32, true);

  // stage 31.
  for (int i = 0; i < 32; i += 4) {
    HadamardRotation(&s[i], &s[63 - i], false, &min, &max);
    HadamardRotation(&s[i + 1], &s[63 - i - 1], false, &min, &max);
    HadamardRotation(&s[i + 2], &s[63 - i - 2], false, &min, &max);
    HadamardRotation(&s[i + 3], &s[63 - i - 3], false, &min, &max);
  }
  //-- end dct 64 stages
  if (is_row) {
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int idx = 0; idx < 64; idx += 8) {
      int32x4_t output[8];
      Transpose4x4(&s[idx], &output[0]);
      Transpose4x4(&s[idx + 4], &output[4]);
      for (int i = 0; i < 8; ++i) {
        // Round-shift, then clamp to signed 16 bits via narrow/widen.
        output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
      }
      StoreDst<4>(dst, step, idx, &output[0]);
      StoreDst<4>(dst, step, idx + 4, &output[4]);
    }
  } else {
    StoreDst<64>(dst, step, 0, &s[0]);
  }
}

//------------------------------------------------------------------------------
// Asymmetric Discrete Sine Transforms (ADST).

// Process adst4 rows or columns, depending on the |is_row| flag.
LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
                                      int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  int32x4_t s[8];
  int32x4_t x[4];

  LoadSrc<4>(dst, step, 0, x);
  if (is_row) {
    Transpose4x4(x, x);
  }

  // stage 1.
  s[5] = vmulq_n_s32(x[3], kAdst4Multiplier[1]);
  s[6] = vmulq_n_s32(x[3], kAdst4Multiplier[3]);

  // stage 2.
  const int32x4_t a7 = vsubq_s32(x[0], x[2]);
  const int32x4_t b7 = vaddq_s32(a7, x[3]);

  // stage 3.
  s[0] = vmulq_n_s32(x[0], kAdst4Multiplier[0]);
  s[1] = vmulq_n_s32(x[0], kAdst4Multiplier[1]);
  // s[0] = s[0] + s[3]
  s[0] = vmlaq_n_s32(s[0], x[2], kAdst4Multiplier[3]);
  // s[1] = s[1] - s[4]
  s[1] = vmlsq_n_s32(s[1], x[2], kAdst4Multiplier[0]);

  s[3] = vmulq_n_s32(x[1], kAdst4Multiplier[2]);
  s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);

  // stage 4.
  s[0] = vaddq_s32(s[0], s[5]);
  s[1] = vsubq_s32(s[1], s[6]);

  // stages 5 and 6.
+ const int32x4_t x0 = vaddq_s32(s[0], s[3]); + const int32x4_t x1 = vaddq_s32(s[1], s[3]); + const int32x4_t x3_a = vaddq_s32(s[0], s[1]); + const int32x4_t x3 = vsubq_s32(x3_a, s[3]); + x[0] = vrshrq_n_s32(x0, 12); + x[1] = vrshrq_n_s32(x1, 12); + x[2] = vrshrq_n_s32(s[2], 12); + x[3] = vrshrq_n_s32(x3, 12); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + x[0] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[0], v_row_shift))); + x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift))); + x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift))); + x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift))); + Transpose4x4(x, x); + } + StoreDst<4>(dst, step, 0, x); +} + +alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344, + 2482}; + +LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int32_t*>(dest); + int32x4_t s[2]; + + const int32x4_t v_src0 = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0); + const int32x4_t v_src0_round = + vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12)); + + const int32x4_t v_src = vbslq_s32(v_mask, v_src0_round, v_src0); + const int32x4_t kAdst4DcOnlyMultipliers = vld1q_s32(kAdst4DcOnlyMultiplier); + s[1] = vdupq_n_s32(0); + + // s0*k0 s0*k1 s0*k2 s0*k1 + s[0] = vmulq_s32(kAdst4DcOnlyMultipliers, v_src); + // 0 0 0 s0*k0 + s[1] = vextq_s32(s[1], s[0], 1); + + const int32x4_t x3 = vaddq_s32(s[0], s[1]); + const int32x4_t dst_0 = vrshrq_n_s32(x3, 12); + + // vqrshlq_s32 will shift right if shift value is negative. 
+ vst1q_s32(dst, + vmovl_s16(vqmovn_s32(vqrshlq_s32(dst_0, vdupq_n_s32(-row_shift))))); + + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int32_t*>(dest); + int32x4_t s[4]; + + int i = 0; + do { + const int32x4_t v_src = vld1q_s32(&dst[i]); + + s[0] = vmulq_n_s32(v_src, kAdst4Multiplier[0]); + s[1] = vmulq_n_s32(v_src, kAdst4Multiplier[1]); + s[2] = vmulq_n_s32(v_src, kAdst4Multiplier[2]); + + const int32x4_t x0 = s[0]; + const int32x4_t x1 = s[1]; + const int32x4_t x2 = s[2]; + const int32x4_t x3 = vaddq_s32(s[0], s[1]); + const int32x4_t dst_0 = vrshrq_n_s32(x0, 12); + const int32x4_t dst_1 = vrshrq_n_s32(x1, 12); + const int32x4_t dst_2 = vrshrq_n_s32(x2, 12); + const int32x4_t dst_3 = vrshrq_n_s32(x3, 12); + + vst1q_s32(&dst[i], dst_0); + vst1q_s32(&dst[i + width * 1], dst_1); + vst1q_s32(&dst[i + width * 2], dst_2); + vst1q_s32(&dst[i + width * 3], dst_3); + + i += 4; + } while (i < width); + + return true; +} + +template <ButterflyRotationFunc butterfly_rotation> +LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step, bool is_row, + int row_shift) { + auto* const dst = static_cast<int32_t*>(dest); + const int32_t range = is_row ? kBitdepth10 + 7 : 15; + const int32x4_t min = vdupq_n_s32(-(1 << range)); + const int32x4_t max = vdupq_n_s32((1 << range) - 1); + int32x4_t s[8], x[8]; + + if (is_row) { + LoadSrc<4>(dst, step, 0, &x[0]); + LoadSrc<4>(dst, step, 4, &x[4]); + Transpose4x4(&x[0], &x[0]); + Transpose4x4(&x[4], &x[4]); + } else { + LoadSrc<8>(dst, step, 0, &x[0]); + } + + // stage 1. + s[0] = x[7]; + s[1] = x[0]; + s[2] = x[5]; + s[3] = x[2]; + s[4] = x[3]; + s[5] = x[4]; + s[6] = x[1]; + s[7] = x[6]; + + // stage 2. 
+ butterfly_rotation(&s[0], &s[1], 60 - 0, true); + butterfly_rotation(&s[2], &s[3], 60 - 16, true); + butterfly_rotation(&s[4], &s[5], 60 - 32, true); + butterfly_rotation(&s[6], &s[7], 60 - 48, true); + + // stage 3. + HadamardRotation(&s[0], &s[4], false, &min, &max); + HadamardRotation(&s[1], &s[5], false, &min, &max); + HadamardRotation(&s[2], &s[6], false, &min, &max); + HadamardRotation(&s[3], &s[7], false, &min, &max); + + // stage 4. + butterfly_rotation(&s[4], &s[5], 48 - 0, true); + butterfly_rotation(&s[7], &s[6], 48 - 32, true); + + // stage 5. + HadamardRotation(&s[0], &s[2], false, &min, &max); + HadamardRotation(&s[4], &s[6], false, &min, &max); + HadamardRotation(&s[1], &s[3], false, &min, &max); + HadamardRotation(&s[5], &s[7], false, &min, &max); + + // stage 6. + butterfly_rotation(&s[2], &s[3], 32, true); + butterfly_rotation(&s[6], &s[7], 32, true); + + // stage 7. + x[0] = s[0]; + x[1] = vqnegq_s32(s[4]); + x[2] = s[6]; + x[3] = vqnegq_s32(s[2]); + x[4] = s[3]; + x[5] = vqnegq_s32(s[7]); + x[6] = s[5]; + x[7] = vqnegq_s32(s[1]); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + for (int i = 0; i < 8; ++i) { + x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift))); + } + Transpose4x4(&x[0], &x[0]); + Transpose4x4(&x[4], &x[4]); + StoreDst<4>(dst, step, 0, &x[0]); + StoreDst<4>(dst, step, 4, &x[4]); + } else { + StoreDst<8>(dst, step, 0, &x[0]); + } +} + +LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int32_t*>(dest); + int32x4_t s[8]; + + const int32x4_t v_src = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12)); + // stage 1. + s[1] = vbslq_s32(v_mask, v_src_round, v_src); + + // stage 2. 
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true); + + // stage 3. + s[4] = s[0]; + s[5] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[4], &s[5], 48, true); + + // stage 5. + s[2] = s[0]; + s[3] = s[1]; + s[6] = s[4]; + s[7] = s[5]; + + // stage 6. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + + // stage 7. + int32x4_t x[8]; + x[0] = s[0]; + x[1] = vqnegq_s32(s[4]); + x[2] = s[6]; + x[3] = vqnegq_s32(s[2]); + x[4] = s[3]; + x[5] = vqnegq_s32(s[7]); + x[6] = s[5]; + x[7] = vqnegq_s32(s[1]); + + for (int i = 0; i < 8; ++i) { + // vqrshlq_s32 will shift right if shift value is negative. + x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift)))); + vst1q_lane_s32(&dst[i], x[i], 0); + } + + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int32_t*>(dest); + int32x4_t s[8]; + + int i = 0; + do { + const int32x4_t v_src = vld1q_s32(dst); + // stage 1. + s[1] = v_src; + + // stage 2. + ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true); + + // stage 3. + s[4] = s[0]; + s[5] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[4], &s[5], 48, true); + + // stage 5. + s[2] = s[0]; + s[3] = s[1]; + s[6] = s[4]; + s[7] = s[5]; + + // stage 6. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + + // stage 7. 
+ int32x4_t x[8]; + x[0] = s[0]; + x[1] = vqnegq_s32(s[4]); + x[2] = s[6]; + x[3] = vqnegq_s32(s[2]); + x[4] = s[3]; + x[5] = vqnegq_s32(s[7]); + x[6] = s[5]; + x[7] = vqnegq_s32(s[1]); + + for (int j = 0; j < 8; ++j) { + vst1q_s32(&dst[j * width], x[j]); + } + i += 4; + dst += 4; + } while (i < width); + + return true; +} + +template <ButterflyRotationFunc butterfly_rotation> +LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row, + int row_shift) { + auto* const dst = static_cast<int32_t*>(dest); + const int32_t range = is_row ? kBitdepth10 + 7 : 15; + const int32x4_t min = vdupq_n_s32(-(1 << range)); + const int32x4_t max = vdupq_n_s32((1 << range) - 1); + int32x4_t s[16], x[16]; + + if (is_row) { + for (int idx = 0; idx < 16; idx += 8) { + LoadSrc<4>(dst, step, idx, &x[idx]); + LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]); + Transpose4x4(&x[idx], &x[idx]); + Transpose4x4(&x[idx + 4], &x[idx + 4]); + } + } else { + LoadSrc<16>(dst, step, 0, &x[0]); + } + + // stage 1. + s[0] = x[15]; + s[1] = x[0]; + s[2] = x[13]; + s[3] = x[2]; + s[4] = x[11]; + s[5] = x[4]; + s[6] = x[9]; + s[7] = x[6]; + s[8] = x[7]; + s[9] = x[8]; + s[10] = x[5]; + s[11] = x[10]; + s[12] = x[3]; + s[13] = x[12]; + s[14] = x[1]; + s[15] = x[14]; + + // stage 2. + butterfly_rotation(&s[0], &s[1], 62 - 0, true); + butterfly_rotation(&s[2], &s[3], 62 - 8, true); + butterfly_rotation(&s[4], &s[5], 62 - 16, true); + butterfly_rotation(&s[6], &s[7], 62 - 24, true); + butterfly_rotation(&s[8], &s[9], 62 - 32, true); + butterfly_rotation(&s[10], &s[11], 62 - 40, true); + butterfly_rotation(&s[12], &s[13], 62 - 48, true); + butterfly_rotation(&s[14], &s[15], 62 - 56, true); + + // stage 3. 
+ HadamardRotation(&s[0], &s[8], false, &min, &max); + HadamardRotation(&s[1], &s[9], false, &min, &max); + HadamardRotation(&s[2], &s[10], false, &min, &max); + HadamardRotation(&s[3], &s[11], false, &min, &max); + HadamardRotation(&s[4], &s[12], false, &min, &max); + HadamardRotation(&s[5], &s[13], false, &min, &max); + HadamardRotation(&s[6], &s[14], false, &min, &max); + HadamardRotation(&s[7], &s[15], false, &min, &max); + + // stage 4. + butterfly_rotation(&s[8], &s[9], 56 - 0, true); + butterfly_rotation(&s[13], &s[12], 8 + 0, true); + butterfly_rotation(&s[10], &s[11], 56 - 32, true); + butterfly_rotation(&s[15], &s[14], 8 + 32, true); + + // stage 5. + HadamardRotation(&s[0], &s[4], false, &min, &max); + HadamardRotation(&s[8], &s[12], false, &min, &max); + HadamardRotation(&s[1], &s[5], false, &min, &max); + HadamardRotation(&s[9], &s[13], false, &min, &max); + HadamardRotation(&s[2], &s[6], false, &min, &max); + HadamardRotation(&s[10], &s[14], false, &min, &max); + HadamardRotation(&s[3], &s[7], false, &min, &max); + HadamardRotation(&s[11], &s[15], false, &min, &max); + + // stage 6. + butterfly_rotation(&s[4], &s[5], 48 - 0, true); + butterfly_rotation(&s[12], &s[13], 48 - 0, true); + butterfly_rotation(&s[7], &s[6], 48 - 32, true); + butterfly_rotation(&s[15], &s[14], 48 - 32, true); + + // stage 7. + HadamardRotation(&s[0], &s[2], false, &min, &max); + HadamardRotation(&s[4], &s[6], false, &min, &max); + HadamardRotation(&s[8], &s[10], false, &min, &max); + HadamardRotation(&s[12], &s[14], false, &min, &max); + HadamardRotation(&s[1], &s[3], false, &min, &max); + HadamardRotation(&s[5], &s[7], false, &min, &max); + HadamardRotation(&s[9], &s[11], false, &min, &max); + HadamardRotation(&s[13], &s[15], false, &min, &max); + + // stage 8. + butterfly_rotation(&s[2], &s[3], 32, true); + butterfly_rotation(&s[6], &s[7], 32, true); + butterfly_rotation(&s[10], &s[11], 32, true); + butterfly_rotation(&s[14], &s[15], 32, true); + + // stage 9. 
+ x[0] = s[0]; + x[1] = vqnegq_s32(s[8]); + x[2] = s[12]; + x[3] = vqnegq_s32(s[4]); + x[4] = s[6]; + x[5] = vqnegq_s32(s[14]); + x[6] = s[10]; + x[7] = vqnegq_s32(s[2]); + x[8] = s[3]; + x[9] = vqnegq_s32(s[11]); + x[10] = s[15]; + x[11] = vqnegq_s32(s[7]); + x[12] = s[5]; + x[13] = vqnegq_s32(s[13]); + x[14] = s[9]; + x[15] = vqnegq_s32(s[1]); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + for (int i = 0; i < 16; ++i) { + x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift))); + } + for (int idx = 0; idx < 16; idx += 8) { + Transpose4x4(&x[idx], &x[idx]); + Transpose4x4(&x[idx + 4], &x[idx + 4]); + StoreDst<4>(dst, step, idx, &x[idx]); + StoreDst<4>(dst, step, idx + 4, &x[idx + 4]); + } + } else { + StoreDst<16>(dst, step, 0, &x[0]); + } +} + +LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int32x4_t* s, int32x4_t* x) { + // stage 2. + ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true); + + // stage 3. + s[8] = s[0]; + s[9] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[8], &s[9], 56, true); + + // stage 5. + s[4] = s[0]; + s[12] = s[8]; + s[5] = s[1]; + s[13] = s[9]; + + // stage 6. + ButterflyRotation_4(&s[4], &s[5], 48, true); + ButterflyRotation_4(&s[12], &s[13], 48, true); + + // stage 7. + s[2] = s[0]; + s[6] = s[4]; + s[10] = s[8]; + s[14] = s[12]; + s[3] = s[1]; + s[7] = s[5]; + s[11] = s[9]; + s[15] = s[13]; + + // stage 8. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + ButterflyRotation_4(&s[10], &s[11], 32, true); + ButterflyRotation_4(&s[14], &s[15], 32, true); + + // stage 9. 
+ x[0] = s[0]; + x[1] = vqnegq_s32(s[8]); + x[2] = s[12]; + x[3] = vqnegq_s32(s[4]); + x[4] = s[6]; + x[5] = vqnegq_s32(s[14]); + x[6] = s[10]; + x[7] = vqnegq_s32(s[2]); + x[8] = s[3]; + x[9] = vqnegq_s32(s[11]); + x[10] = s[15]; + x[11] = vqnegq_s32(s[7]); + x[12] = s[5]; + x[13] = vqnegq_s32(s[13]); + x[14] = s[9]; + x[15] = vqnegq_s32(s[1]); +} + +LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int32_t*>(dest); + int32x4_t s[16]; + int32x4_t x[16]; + const int32x4_t v_src = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12)); + // stage 1. + s[1] = vbslq_s32(v_mask, v_src_round, v_src); + + Adst16DcOnlyInternal(s, x); + + for (int i = 0; i < 16; ++i) { + // vqrshlq_s32 will shift right if shift value is negative. + x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift)))); + vst1q_lane_s32(&dst[i], x[i], 0); + } + + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest, + int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int32_t*>(dest); + int i = 0; + do { + int32x4_t s[16]; + int32x4_t x[16]; + const int32x4_t v_src = vld1q_s32(dst); + // stage 1. + s[1] = v_src; + + Adst16DcOnlyInternal(s, x); + + for (int j = 0; j < 16; ++j) { + vst1q_s32(&dst[j * width], x[j]); + } + i += 4; + dst += 4; + } while (i < width); + + return true; +} + +//------------------------------------------------------------------------------ +// Identity Transforms. 
+ +LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step, int shift) { + auto* const dst = static_cast<int32_t*>(dest); + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier); + const int32x4_t v_shift = vdupq_n_s32(-(12 + shift)); + for (int i = 0; i < 4; ++i) { + const int32x4_t v_src = vld1q_s32(&dst[i * step]); + const int32x4_t v_src_mult_lo = + vmlaq_s32(v_dual_round, v_src, v_multiplier); + const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift); + vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(shift_lo))); + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int tx_height) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int32_t*>(dest); + const int32x4_t v_src0 = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12)); + const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0); + const int shift = tx_height < 16 ? 
0 : 1; + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier); + const int32x4_t v_shift = vdupq_n_s32(-(12 + shift)); + const int32x4_t v_src_mult_lo = vmlaq_s32(v_dual_round, v_src, v_multiplier); + const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift); + vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0); + return true; +} + +template <int identity_size> +LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame( + Array2DView<uint16_t> frame, const int start_x, const int start_y, + const int tx_width, const int tx_height, const int32_t* source) { + static_assert(identity_size == 4 || identity_size == 8 || identity_size == 16, + "Invalid identity_size."); + const int stride = frame.columns(); + uint16_t* dst = frame[start_y] + start_x; + const int32x4_t v_dual_round = vdupq_n_s32((1 + (1 << 4)) << 11); + const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); + + if (tx_width == 4) { + int i = 0; + do { + int32x4x2_t v_src, v_dst_i, a, b; + v_src.val[0] = vld1q_s32(&source[i * 4]); + v_src.val[1] = vld1q_s32(&source[(i * 4) + 4]); + if (identity_size == 4) { + v_dst_i.val[0] = + vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier); + v_dst_i.val[1] = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier); + a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12); + a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12); + } else if (identity_size == 8) { + v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]); + v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]); + a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4); + a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4); + } else { // identity_size == 16 + v_dst_i.val[0] = + vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier); + v_dst_i.val[1] = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier); + a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12); + a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12); + } + 
uint16x4x2_t frame_data; + frame_data.val[0] = vld1_u16(dst); + frame_data.val[1] = vld1_u16(dst + stride); + b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0])); + b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1])); + vst1_u16(dst, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth)); + vst1_u16(dst + stride, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth)); + dst += stride << 1; + i += 2; + } while (i < tx_height); + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + int32x4x2_t v_src, v_dst_i, a, b; + v_src.val[0] = vld1q_s32(&source[row + j]); + v_src.val[1] = vld1q_s32(&source[row + j + 4]); + if (identity_size == 4) { + v_dst_i.val[0] = + vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier); + v_dst_i.val[1] = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier); + a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12); + a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12); + } else if (identity_size == 8) { + v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]); + v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]); + a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4); + a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4); + } else { // identity_size == 16 + v_dst_i.val[0] = + vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier); + v_dst_i.val[1] = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier); + a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12); + a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12); + } + uint16x4x2_t frame_data; + frame_data.val[0] = vld1_u16(dst + j); + frame_data.val[1] = vld1_u16(dst + j + 4); + b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0])); + b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1])); + vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth)); + vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth)); + j += 8; + } while (j < tx_width); + dst += stride; + } while (++i < 
tx_height); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame( + Array2DView<uint16_t> frame, const int start_x, const int start_y, + const int tx_width, const int tx_height, const int32_t* source) { + const int stride = frame.columns(); + uint16_t* dst = frame[start_y] + start_x; + const int32x4_t v_round = vdupq_n_s32((1 + (0)) << 11); + const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); + + if (tx_width == 4) { + int i = 0; + do { + const int32x4_t v_src = vld1q_s32(&source[i * 4]); + const int32x4_t v_dst_row = + vshrq_n_s32(vmlaq_n_s32(v_round, v_src, kIdentity4Multiplier), 12); + const int32x4_t v_dst_col = + vmlaq_n_s32(v_round, v_dst_row, kIdentity4Multiplier); + const uint16x4_t frame_data = vld1_u16(dst); + const int32x4_t a = vrshrq_n_s32(v_dst_col, 4 + 12); + const int32x4_t b = vaddw_s16(a, vreinterpret_s16_u16(frame_data)); + vst1_u16(dst, vmin_u16(vqmovun_s32(b), v_max_bitdepth)); + dst += stride; + } while (++i < tx_height); + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + int32x4x2_t v_src, v_src_round, v_dst_row, v_dst_col, a, b; + v_src.val[0] = vld1q_s32(&source[row + j]); + v_src.val[1] = vld1q_s32(&source[row + j + 4]); + v_src_round.val[0] = vshrq_n_s32( + vmlaq_n_s32(v_round, v_src.val[0], kTransformRowMultiplier), 12); + v_src_round.val[1] = vshrq_n_s32( + vmlaq_n_s32(v_round, v_src.val[1], kTransformRowMultiplier), 12); + v_dst_row.val[0] = vqaddq_s32(v_src_round.val[0], v_src_round.val[0]); + v_dst_row.val[1] = vqaddq_s32(v_src_round.val[1], v_src_round.val[1]); + v_dst_col.val[0] = + vmlaq_n_s32(v_round, v_dst_row.val[0], kIdentity4Multiplier); + v_dst_col.val[1] = + vmlaq_n_s32(v_round, v_dst_row.val[1], kIdentity4Multiplier); + uint16x4x2_t frame_data; + frame_data.val[0] = vld1_u16(dst + j); + frame_data.val[1] = vld1_u16(dst + j + 4); + a.val[0] = vrshrq_n_s32(v_dst_col.val[0], 4 + 12); + a.val[1] = vrshrq_n_s32(v_dst_col.val[1], 4 + 12); + b.val[0] = 
vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0])); + b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1])); + vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth)); + vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth)); + j += 8; + } while (j < tx_width); + dst += stride; + } while (++i < tx_height); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) { + auto* const dst = static_cast<int32_t*>(dest); + + // When combining the identity8 multiplier with the row shift, the + // calculations for tx_height equal to 32 can be simplified from + // ((A * 2) + 2) >> 2) to ((A + 1) >> 1). + for (int i = 0; i < 4; ++i) { + const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]); + const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]); + const int32x4_t a_lo = vrshrq_n_s32(v_src_lo, 1); + const int32x4_t a_hi = vrshrq_n_s32(v_src_hi, 1); + vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(a_lo))); + vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(a_hi))); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) { + auto* const dst = static_cast<int32_t*>(dest); + + for (int i = 0; i < 4; ++i) { + const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]); + const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]); + const int32x4_t v_srcx2_lo = vqaddq_s32(v_src_lo, v_src_lo); + const int32x4_t v_srcx2_hi = vqaddq_s32(v_src_hi, v_src_hi); + vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(v_srcx2_lo))); + vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(v_srcx2_hi))); + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int32_t*>(dest); + const int32x4_t v_src0 = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 
0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12)); + const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0); + const int32x4_t v_srcx2 = vaddq_s32(v_src, v_src); + const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift)); + vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0); + return true; +} + +LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step, + int shift) { + auto* const dst = static_cast<int32_t*>(dest); + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int32x4_t v_shift = vdupq_n_s32(-(12 + shift)); + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 2; ++j) { + int32x4x2_t v_src; + v_src.val[0] = vld1q_s32(&dst[i * step + j * 8]); + v_src.val[1] = vld1q_s32(&dst[i * step + j * 8 + 4]); + const int32x4_t v_src_mult_lo = + vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier); + const int32x4_t v_src_mult_hi = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier); + const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift); + const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift); + vst1q_s32(&dst[i * step + j * 8], vmovl_s16(vqmovn_s32(shift_lo))); + vst1q_s32(&dst[i * step + j * 8 + 4], vmovl_s16(vqmovn_s32(shift_hi))); + } + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int32_t*>(dest); + const int32x4_t v_src0 = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 
0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12)); + const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0); + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int32x4_t v_src_mult_lo = + vmlaq_n_s32(v_dual_round, v_src, kIdentity16Multiplier); + const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, vdupq_n_s32(-(12 + shift))); + vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0); + return true; +} + +//------------------------------------------------------------------------------ +// row/column transform loops + +template <int tx_height> +LIBGAV1_ALWAYS_INLINE void FlipColumns(int32_t* source, int tx_width) { + if (tx_width >= 16) { + int i = 0; + do { + // 00 01 02 03 + const int32x4_t a = vld1q_s32(&source[i]); + const int32x4_t b = vld1q_s32(&source[i + 4]); + const int32x4_t c = vld1q_s32(&source[i + 8]); + const int32x4_t d = vld1q_s32(&source[i + 12]); + // 01 00 03 02 + const int32x4_t a_rev = vrev64q_s32(a); + const int32x4_t b_rev = vrev64q_s32(b); + const int32x4_t c_rev = vrev64q_s32(c); + const int32x4_t d_rev = vrev64q_s32(d); + // 03 02 01 00 + vst1q_s32(&source[i], vextq_s32(d_rev, d_rev, 2)); + vst1q_s32(&source[i + 4], vextq_s32(c_rev, c_rev, 2)); + vst1q_s32(&source[i + 8], vextq_s32(b_rev, b_rev, 2)); + vst1q_s32(&source[i + 12], vextq_s32(a_rev, a_rev, 2)); + i += 16; + } while (i < tx_width * tx_height); + } else if (tx_width == 8) { + for (int i = 0; i < 8 * tx_height; i += 8) { + // 00 01 02 03 + const int32x4_t a = vld1q_s32(&source[i]); + const int32x4_t b = vld1q_s32(&source[i + 4]); + // 01 00 03 02 + const int32x4_t a_rev = vrev64q_s32(a); + const int32x4_t b_rev = vrev64q_s32(b); + // 03 02 01 00 + vst1q_s32(&source[i], vextq_s32(b_rev, b_rev, 2)); + vst1q_s32(&source[i + 4], vextq_s32(a_rev, a_rev, 2)); + } + } else { + // Process two rows per iteration. 
+ for (int i = 0; i < 4 * tx_height; i += 8) { + // 00 01 02 03 + const int32x4_t a = vld1q_s32(&source[i]); + const int32x4_t b = vld1q_s32(&source[i + 4]); + // 01 00 03 02 + const int32x4_t a_rev = vrev64q_s32(a); + const int32x4_t b_rev = vrev64q_s32(b); + // 03 02 01 00 + vst1q_s32(&source[i], vextq_s32(a_rev, a_rev, 2)); + vst1q_s32(&source[i + 4], vextq_s32(b_rev, b_rev, 2)); + } + } +} + +template <int tx_width> +LIBGAV1_ALWAYS_INLINE void ApplyRounding(int32_t* source, int num_rows) { + // Process two rows per iteration. + int i = 0; + do { + const int32x4_t a_lo = vld1q_s32(&source[i]); + const int32x4_t a_hi = vld1q_s32(&source[i + 4]); + const int32x4_t b_lo = + vqrdmulhq_n_s32(a_lo, kTransformRowMultiplier << (31 - 12)); + const int32x4_t b_hi = + vqrdmulhq_n_s32(a_hi, kTransformRowMultiplier << (31 - 12)); + vst1q_s32(&source[i], b_lo); + vst1q_s32(&source[i + 4], b_hi); + i += 8; + } while (i < tx_width * num_rows); +} + +template <int tx_width> +LIBGAV1_ALWAYS_INLINE void RowShift(int32_t* source, int num_rows, + int row_shift) { + // vqrshlq_s32 will shift right if shift value is negative. + row_shift = -row_shift; + + // Process two rows per iteration. + int i = 0; + do { + const int32x4_t residual0 = vld1q_s32(&source[i]); + const int32x4_t residual1 = vld1q_s32(&source[i + 4]); + vst1q_s32(&source[i], vqrshlq_s32(residual0, vdupq_n_s32(row_shift))); + vst1q_s32(&source[i + 4], vqrshlq_s32(residual1, vdupq_n_s32(row_shift))); + i += 8; + } while (i < tx_width * num_rows); +} + +template <int tx_height, bool enable_flip_rows = false> +LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound( + Array2DView<uint16_t> frame, const int start_x, const int start_y, + const int tx_width, const int32_t* source, TransformType tx_type) { + const bool flip_rows = + enable_flip_rows ? 
kTransformFlipRowsMask.Contains(tx_type) : false; + const int stride = frame.columns(); + uint16_t* dst = frame[start_y] + start_x; + + if (tx_width == 4) { + for (int i = 0; i < tx_height; ++i) { + const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4; + const int32x4_t residual = vld1q_s32(&source[row]); + const uint16x4_t frame_data = vld1_u16(dst); + const int32x4_t a = vrshrq_n_s32(residual, 4); + const uint32x4_t b = vaddw_u16(vreinterpretq_u32_s32(a), frame_data); + const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b)); + vst1_u16(dst, vmin_u16(d, vdup_n_u16((1 << kBitdepth10) - 1))); + dst += stride; + } + } else { + for (int i = 0; i < tx_height; ++i) { + const int y = start_y + i; + const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width; + int j = 0; + do { + const int x = start_x + j; + const int32x4_t residual = vld1q_s32(&source[row + j]); + const int32x4_t residual_hi = vld1q_s32(&source[row + j + 4]); + const uint16x8_t frame_data = vld1q_u16(frame[y] + x); + const int32x4_t a = vrshrq_n_s32(residual, 4); + const int32x4_t a_hi = vrshrq_n_s32(residual_hi, 4); + const uint32x4_t b = + vaddw_u16(vreinterpretq_u32_s32(a), vget_low_u16(frame_data)); + const uint32x4_t b_hi = + vaddw_u16(vreinterpretq_u32_s32(a_hi), vget_high_u16(frame_data)); + const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b)); + const uint16x4_t d_hi = vqmovun_s32(vreinterpretq_s32_u32(b_hi)); + vst1q_u16(frame[y] + x, vminq_u16(vcombine_u16(d, d_hi), + vdupq_n_u16((1 << kBitdepth10) - 1))); + j += 8; + } while (j < tx_width); + } + } +} + +void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const bool should_round = (tx_height == 8); + const int row_shift = (tx_height == 16); + + if (DctDcOnly<4>(src, 
adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<4>(src, adjusted_tx_height); + } + + // Process 4 1d dct4 rows in parallel per iteration. + int i = adjusted_tx_height; + auto* data = src; + do { + Dct4_NEON<ButterflyRotation_4>(data, /*step=*/4, /*is_row=*/true, + row_shift); + data += 16; + i -= 4; + } while (i != 0); +} + +void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<4>(src, tx_width); + } + + if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct4 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Dct4_NEON<ButterflyRotation_4>(data, tx_width, /*transpose=*/false, + /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + auto* src = static_cast<int32_t*>(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<8>(src, adjusted_tx_height); + } + + // Process 4 1d dct8 rows in parallel per iteration. 
+ int i = adjusted_tx_height; + auto* data = src; + do { + Dct8_NEON<ButterflyRotation_4>(data, /*step=*/8, /*is_row=*/true, + row_shift); + data += 32; + i -= 4; + } while (i != 0); +} + +void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<8>(src, tx_width); + } + + if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct8 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Dct8_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false, + /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast<int32_t*>(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<16>(src, adjusted_tx_height); + } + + assert(adjusted_tx_height % 4 == 0); + int i = adjusted_tx_height; + auto* data = src; + do { + // Process 4 1d dct16 rows in parallel per iteration. 
+ Dct16_NEON<ButterflyRotation_4>(data, 16, /*is_row=*/true, row_shift); + data += 64; + i -= 4; + } while (i != 0); +} + +void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<16>(src, tx_width); + } + + if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct16 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Dct16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false, + /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast<int32_t*>(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<32>(src, adjusted_tx_height); + } + + assert(adjusted_tx_height % 4 == 0); + int i = adjusted_tx_height; + auto* data = src; + do { + // Process 4 1d dct32 rows in parallel per iteration. 
+ Dct32_NEON(data, 32, /*is_row=*/true, row_shift); + data += 128; + i -= 4; + } while (i != 0); +} + +void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<32>(src, tx_width); + } + + if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct32 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast<int32_t*>(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<64>(src, adjusted_tx_height); + } + + assert(adjusted_tx_height % 4 == 0); + int i = adjusted_tx_height; + auto* data = src; + do { + // Process 4 1d dct64 rows in parallel per iteration. 
+ Dct64_NEON(data, 64, /*is_row=*/true, row_shift); + data += 128 * 2; + i -= 4; + } while (i != 0); +} + +void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<64>(src, tx_width); + } + + if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct64 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const int row_shift = static_cast<int>(tx_height == 16); + const bool should_round = (tx_height == 8); + + if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<4>(src, adjusted_tx_height); + } + + // Process 4 1d adst4 rows in parallel per iteration. 
+ int i = adjusted_tx_height; + auto* data = src; + do { + Adst4_NEON(data, /*step=*/4, /*is_row=*/true, row_shift); + data += 16; + i -= 4; + } while (i != 0); +} + +void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<4>(src, tx_width); + } + + if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) { + // Process 4 1d adst4 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Adst4_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y, + tx_width, src, tx_type); +} + +void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast<int32_t*>(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<8>(src, adjusted_tx_height); + } + + // Process 4 1d adst8 rows in parallel per iteration. 
+ assert(adjusted_tx_height % 4 == 0); + int i = adjusted_tx_height; + auto* data = src; + do { + Adst8_NEON<ButterflyRotation_4>(data, /*step=*/8, + /*transpose=*/true, row_shift); + data += 32; + i -= 4; + } while (i != 0); +} + +void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<8>(src, tx_width); + } + + if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) { + // Process 4 1d adst8 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Adst8_NEON<ButterflyRotation_4>(data, tx_width, /*transpose=*/false, + /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y, + tx_width, src, tx_type); +} + +void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast<int32_t*>(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<16>(src, adjusted_tx_height); + } + + assert(adjusted_tx_height % 4 == 0); + int i = adjusted_tx_height; + do { + // Process 4 1d adst16 rows in parallel per iteration. 
+ Adst16_NEON<ButterflyRotation_4>(src, 16, /*is_row=*/true, row_shift); + src += 64; + i -= 4; + } while (i != 0); +} + +void Adst16TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<16>(src, tx_width); + } + + if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) { + int i = tx_width; + auto* data = src; + do { + // Process 4 1d adst16 columns in parallel per iteration. + Adst16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false, + /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y, + tx_width, src, tx_type); +} + +void Identity4TransformLoopRow_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + // Special case: Process row calculations during column transform call. + // Improves performance. + if (tx_type == kTransformTypeIdentityIdentity && + tx_size == kTransformSize4x4) { + return; + } + + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const bool should_round = (tx_height == 8); + + if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) { + return; + } + + if (should_round) { + ApplyRounding<4>(src, adjusted_tx_height); + } + + const int shift = tx_height > 8 ? 
1 : 0; + int i = adjusted_tx_height; + do { + Identity4_NEON(src, /*step=*/4, shift); + src += 16; + i -= 4; + } while (i != 0); +} + +void Identity4TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, + void* dst_frame) { + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + // Special case: Process row calculations during column transform call. + if (tx_type == kTransformTypeIdentityIdentity && + (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) { + Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); + return; + } + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<4>(src, tx_width); + } + + IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); +} + +void Identity8TransformLoopRow_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + // Special case: Process row calculations during column transform call. + // Improves performance. + if (tx_type == kTransformTypeIdentityIdentity && + tx_size == kTransformSize8x4) { + return; + } + + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + if (should_round) { + ApplyRounding<8>(src, adjusted_tx_height); + } + + // When combining the identity8 multiplier with the row shift, the + // calculations for tx_height == 8 and tx_height == 16 can be simplified + // from ((A * 2) + 1) >> 1) to A. For 10bpp, A must be clamped to a signed 16 + // bit value. 
+ if ((tx_height & 0x18) != 0) { + for (int i = 0; i < tx_height; ++i) { + const int32x4_t v_src_lo = vld1q_s32(&src[i * 8]); + const int32x4_t v_src_hi = vld1q_s32(&src[(i * 8) + 4]); + vst1q_s32(&src[i * 8], vmovl_s16(vqmovn_s32(v_src_lo))); + vst1q_s32(&src[(i * 8) + 4], vmovl_s16(vqmovn_s32(v_src_hi))); + } + return; + } + if (tx_height == 32) { + int i = adjusted_tx_height; + do { + Identity8Row32_NEON(src, /*step=*/8); + src += 32; + i -= 4; + } while (i != 0); + return; + } + + assert(tx_size == kTransformSize8x4); + int i = adjusted_tx_height; + do { + Identity8Row4_NEON(src, /*step=*/8); + src += 32; + i -= 4; + } while (i != 0); +} + +void Identity8TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, + void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<8>(src, tx_width); + } + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); +} + +void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + auto* src = static_cast<int32_t*>(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<16>(src, adjusted_tx_height); + } + int i = adjusted_tx_height; + do { + Identity16Row_NEON(src, /*step=*/16, row_shift); + src += 64; + i -= 4; + } while (i != 0); +} + +void Identity16TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, + void* src_buffer, int start_x, + int 
start_y, void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<16>(src, tx_width); + } + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); +} + +//------------------------------------------------------------------------------ + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + // Maximum transform size for Dct is 64. + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] = + Dct4TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] = + Dct4TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] = + Dct8TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] = + Dct8TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] = + Dct16TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] = + Dct16TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] = + Dct32TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] = + Dct32TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] = + Dct64TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] = + Dct64TransformLoopColumn_NEON; + + // Maximum transform size for Adst is 16. 
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] = + Adst4TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] = + Adst4TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] = + Adst8TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] = + Adst8TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] = + Adst16TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] = + Adst16TransformLoopColumn_NEON; + + // Maximum transform size for Identity transform is 32. + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] = + Identity4TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] = + Identity4TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] = + Identity8TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] = + Identity8TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] = + Identity16TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] = + Identity16TransformLoopColumn_NEON; +} + +} // namespace + +void InverseTransformInit10bpp_NEON() { Init10bpp(); } + +} // namespace dsp +} // namespace libgav1 +#else // !LIBGAV1_ENABLE_NEON || LIBGAV1_MAX_BITDEPTH < 10 +namespace libgav1 { +namespace dsp { + +void InverseTransformInit10bpp_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10 diff --git a/src/dsp/arm/inverse_transform_neon.cc b/src/dsp/arm/inverse_transform_neon.cc index 072991a..315d5e9 100644 --- a/src/dsp/arm/inverse_transform_neon.cc +++ b/src/dsp/arm/inverse_transform_neon.cc @@ -3117,7 +3117,7 @@ void 
InverseTransformInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/inverse_transform_neon.h b/src/dsp/arm/inverse_transform_neon.h index af647e8..91e0e83 100644 --- a/src/dsp/arm/inverse_transform_neon.h +++ b/src/dsp/arm/inverse_transform_neon.h @@ -26,6 +26,7 @@ namespace dsp { // Initializes Dsp::inverse_transforms, see the defines below for specifics. // This function is not thread-safe. void InverseTransformInit_NEON(); +void InverseTransformInit10bpp_NEON(); } // namespace dsp } // namespace libgav1 @@ -47,6 +48,21 @@ void InverseTransformInit_NEON(); #define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize32_1DTransformDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize64_1DTransformDct LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformAdst LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformAdst LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformAdst LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformIdentity LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformIdentity LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformIdentity LIBGAV1_CPU_NEON + #endif // LIBGAV1_ENABLE_NEON #endif // LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_ diff --git a/src/dsp/arm/loop_filter_neon.cc b/src/dsp/arm/loop_filter_neon.cc index 146c983..8d72892 100644 --- a/src/dsp/arm/loop_filter_neon.cc +++ b/src/dsp/arm/loop_filter_neon.cc 
@@ -35,7 +35,7 @@ namespace { // (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh) inline uint8x8_t Hev(const uint8x8_t abd_p0p1_q0q1, const uint8_t thresh) { const uint8x8_t a = vcgt_u8(abd_p0p1_q0q1, vdup_n_u8(thresh)); - return vorr_u8(a, RightShift<32>(a)); + return vorr_u8(a, RightShiftVector<32>(a)); } // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh @@ -44,7 +44,7 @@ inline uint8x8_t OuterThreshold(const uint8x8_t p0q0, const uint8x8_t p1q1, const uint8x8x2_t a = Interleave32(p0q0, p1q1); const uint8x8_t b = vabd_u8(a.val[0], a.val[1]); const uint8x8_t p0q0_double = vqadd_u8(b, b); - const uint8x8_t p1q1_half = RightShift<32>(vshr_n_u8(b, 1)); + const uint8x8_t p1q1_half = RightShiftVector<32>(vshr_n_u8(b, 1)); const uint8x8_t c = vqadd_u8(p0q0_double, p1q1_half); return vcle_u8(c, vdup_n_u8(outer_thresh)); } @@ -56,7 +56,7 @@ inline uint8x8_t NeedsFilter4(const uint8x8_t abd_p0p1_q0q1, const uint8_t inner_thresh, const uint8_t outer_thresh) { const uint8x8_t a = vcle_u8(abd_p0p1_q0q1, vdup_n_u8(inner_thresh)); - const uint8x8_t inner_mask = vand_u8(a, RightShift<32>(a)); + const uint8x8_t inner_mask = vand_u8(a, RightShiftVector<32>(a)); const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh); return vand_u8(inner_mask, outer_mask); } @@ -121,7 +121,7 @@ inline void Filter4(const uint8x8_t q0p1, const uint8x8_t p0q1, vcombine_s16(vget_low_s16(p0q1_l), vget_low_s16(q0p1_l)); // Need to shift the second term or we end up with a2_ma2. 
const int8x8_t a2_ma1 = - InterleaveLow32(a2_a1, RightShift<32>(vneg_s8(a2_a1))); + InterleaveLow32(a2_a1, RightShiftVector<32>(vneg_s8(a2_a1))); const int16x8_t p0q0_a = vaddw_s8(p0q0_l, a2_ma1); *p1q1_result = vqmovun_s16(p1q1_a3); @@ -251,7 +251,7 @@ inline uint8x8_t IsFlat3(const uint8x8_t abd_p0p1_q0q1, const uint8x8_t abd_p0p2_q0q2) { const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p0p2_q0q2); const uint8x8_t b = vcle_u8(a, vdup_n_u8(1)); - return vand_u8(b, RightShift<32>(b)); + return vand_u8(b, RightShiftVector<32>(b)); } // abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh && @@ -264,7 +264,7 @@ inline uint8x8_t NeedsFilter6(const uint8x8_t abd_p0p1_q0q1, const uint8_t outer_thresh) { const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2); const uint8x8_t b = vcle_u8(a, vdup_n_u8(inner_thresh)); - const uint8x8_t inner_mask = vand_u8(b, RightShift<32>(b)); + const uint8x8_t inner_mask = vand_u8(b, RightShiftVector<32>(b)); const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh); return vand_u8(inner_mask, outer_mask); } @@ -482,7 +482,7 @@ inline uint8x8_t IsFlat4(const uint8x8_t abd_p0n0_q0n0, const uint8x8_t a = vmax_u8(abd_p0n0_q0n0, abd_p0n1_q0n1); const uint8x8_t b = vmax_u8(a, abd_p0n2_q0n2); const uint8x8_t c = vcle_u8(b, vdup_n_u8(1)); - return vand_u8(c, RightShift<32>(c)); + return vand_u8(c, RightShiftVector<32>(c)); } // abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh && @@ -498,7 +498,7 @@ inline uint8x8_t NeedsFilter8(const uint8x8_t abd_p0p1_q0q1, const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2); const uint8x8_t b = vmax_u8(a, abd_p2p3_q2q3); const uint8x8_t c = vcle_u8(b, vdup_n_u8(inner_thresh)); - const uint8x8_t inner_mask = vand_u8(c, RightShift<32>(c)); + const uint8x8_t inner_mask = vand_u8(c, RightShiftVector<32>(c)); const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh); return vand_u8(inner_mask, outer_mask); } @@ -1179,7 +1179,7 @@ void LoopFilterInit_NEON() { 
low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc index 337c9b4..e6ceb66 100644 --- a/src/dsp/arm/loop_restoration_neon.cc +++ b/src/dsp/arm/loop_restoration_neon.cc @@ -41,10 +41,25 @@ inline uint8x8_t VshrU128(const uint8x8x2_t src) { } template <int bytes> +inline uint8x8_t VshrU128(const uint8x8_t src[2]) { + return vext_u8(src[0], src[1], bytes); +} + +template <int bytes> +inline uint8x16_t VshrU128(const uint8x16_t src[2]) { + return vextq_u8(src[0], src[1], bytes); +} + +template <int bytes> inline uint16x8_t VshrU128(const uint16x8x2_t src) { return vextq_u16(src.val[0], src.val[1], bytes / 2); } +template <int bytes> +inline uint16x8_t VshrU128(const uint16x8_t src[2]) { + return vextq_u16(src[0], src[1], bytes / 2); +} + // Wiener // Must make a local copy of coefficients to help compiler know that they have @@ -177,18 +192,17 @@ inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride, int16_t** const wiener_buffer) { for (int y = height; y != 0; --y) { const uint8_t* src_ptr = src; - uint8x16_t s[4]; - s[0] = vld1q_u8(src_ptr); + uint8x16_t s[3]; ptrdiff_t x = width; do { - src_ptr += 16; - s[3] = vld1q_u8(src_ptr); - s[1] = vextq_u8(s[0], s[3], 1); - s[2] = vextq_u8(s[0], s[3], 2); + // Slightly faster than using vextq_u8(). + s[0] = vld1q_u8(src_ptr); + s[1] = vld1q_u8(src_ptr + 1); + s[2] = vld1q_u8(src_ptr + 2); int16x8x2_t sum; sum.val[0] = sum.val[1] = vdupq_n_s16(0); WienerHorizontalSum(s, filter, sum, *wiener_buffer); - s[0] = s[3]; + src_ptr += 16; *wiener_buffer += 16; x -= 16; } while (x != 0); @@ -476,12 +490,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer, // For width 16 and up, store the horizontal results, and then do the vertical // filter row by row. 
This is faster than doing it column by column when // considering cache issues. -void WienerFilter_NEON(const RestorationUnitInfo& restoration_info, - const void* const source, const void* const top_border, - const void* const bottom_border, const ptrdiff_t stride, - const int width, const int height, - RestorationBuffer* const restoration_buffer, - void* const dest) { +void WienerFilter_NEON( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { const int16_t* const number_leading_zero_coefficients = restoration_info.wiener_info.number_leading_zero_coefficients; const int number_rows_to_skip = std::max( @@ -509,39 +523,42 @@ void WienerFilter_NEON(const RestorationUnitInfo& restoration_info, const auto* const top = static_cast<const uint8_t*>(top_border); const auto* const bottom = static_cast<const uint8_t*>(bottom_border); if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { - WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride, - wiener_stride, height_extra, filter_horizontal, - &wiener_buffer_horizontal); - WienerHorizontalTap7(src - 3, stride, wiener_stride, height, + WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3, + top_border_stride, wiener_stride, height_extra, filter_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra, + WienerHorizontalTap7(src - 3, stride, wiener_stride, height, filter_horizontal, &wiener_buffer_horizontal); - } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { - WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride, - wiener_stride, height_extra, filter_horizontal, + WienerHorizontalTap7(bottom - 3, 
bottom_border_stride, wiener_stride, + height_extra, filter_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(src - 2, stride, wiener_stride, height, + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2, + top_border_stride, wiener_stride, height_extra, filter_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra, + WienerHorizontalTap5(src - 2, stride, wiener_stride, height, filter_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal, + &wiener_buffer_horizontal); } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { // The maximum over-reads happen here. - WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride, - wiener_stride, height_extra, filter_horizontal, - &wiener_buffer_horizontal); - WienerHorizontalTap3(src - 1, stride, wiener_stride, height, + WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1, + top_border_stride, wiener_stride, height_extra, filter_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra, + WienerHorizontalTap3(src - 1, stride, wiener_stride, height, filter_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal, + &wiener_buffer_horizontal); } else { assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); - WienerHorizontalTap1(top + (2 - height_extra) * stride, stride, - wiener_stride, height_extra, + WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride, + top_border_stride, wiener_stride, height_extra, &wiener_buffer_horizontal); WienerHorizontalTap1(src, stride, wiener_stride, height, &wiener_buffer_horizontal); - WienerHorizontalTap1(bottom, stride, 
wiener_stride, height_extra, - &wiener_buffer_horizontal); + WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride, + height_extra, &wiener_buffer_horizontal); } // vertical filtering. @@ -574,13 +591,20 @@ void WienerFilter_NEON(const RestorationUnitInfo& restoration_info, //------------------------------------------------------------------------------ // SGR -inline void Prepare3_8(const uint8x8x2_t src, uint8x8_t dst[3]) { +inline void Prepare3_8(const uint8x8_t src[2], uint8x8_t dst[3]) { dst[0] = VshrU128<0>(src); dst[1] = VshrU128<1>(src); dst[2] = VshrU128<2>(src); } -inline void Prepare3_16(const uint16x8x2_t src, uint16x4_t low[3], +template <int offset> +inline void Prepare3_8(const uint8x16_t src[2], uint8x16_t dst[3]) { + dst[0] = VshrU128<offset + 0>(src); + dst[1] = VshrU128<offset + 1>(src); + dst[2] = VshrU128<offset + 2>(src); +} + +inline void Prepare3_16(const uint16x8_t src[2], uint16x4_t low[3], uint16x4_t high[3]) { uint16x8_t s[3]; s[0] = VshrU128<0>(src); @@ -594,7 +618,7 @@ inline void Prepare3_16(const uint16x8x2_t src, uint16x4_t low[3], high[2] = vget_high_u16(s[2]); } -inline void Prepare5_8(const uint8x8x2_t src, uint8x8_t dst[5]) { +inline void Prepare5_8(const uint8x8_t src[2], uint8x8_t dst[5]) { dst[0] = VshrU128<0>(src); dst[1] = VshrU128<1>(src); dst[2] = VshrU128<2>(src); @@ -602,7 +626,16 @@ inline void Prepare5_8(const uint8x8x2_t src, uint8x8_t dst[5]) { dst[4] = VshrU128<4>(src); } -inline void Prepare5_16(const uint16x8x2_t src, uint16x4_t low[5], +template <int offset> +inline void Prepare5_8(const uint8x16_t src[2], uint8x16_t dst[5]) { + dst[0] = VshrU128<offset + 0>(src); + dst[1] = VshrU128<offset + 1>(src); + dst[2] = VshrU128<offset + 2>(src); + dst[3] = VshrU128<offset + 3>(src); + dst[4] = VshrU128<offset + 4>(src); +} + +inline void Prepare5_16(const uint16x8_t src[2], uint16x4_t low[5], uint16x4_t high[5]) { Prepare3_16(src, low, high); const uint16x8_t s3 = VshrU128<6>(src); @@ -641,6 +674,30 @@ 
inline uint16x8_t Sum3W_16(const uint8x8_t src[3]) { return vaddw_u8(sum, src[2]); } +inline uint16x8_t Sum3WLo16(const uint8x16_t src[3]) { + const uint16x8_t sum = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1])); + return vaddw_u8(sum, vget_low_u8(src[2])); +} + +inline uint16x8_t Sum3WHi16(const uint8x16_t src[3]) { + const uint16x8_t sum = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1])); + return vaddw_u8(sum, vget_high_u8(src[2])); +} + +inline uint16x8_t Sum5WLo16(const uint8x16_t src[5]) { + const uint16x8_t sum01 = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1])); + const uint16x8_t sum23 = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[3])); + const uint16x8_t sum = vaddq_u16(sum01, sum23); + return vaddw_u8(sum, vget_low_u8(src[4])); +} + +inline uint16x8_t Sum5WHi16(const uint8x16_t src[5]) { + const uint16x8_t sum01 = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1])); + const uint16x8_t sum23 = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[3])); + const uint16x8_t sum = vaddq_u16(sum01, sum23); + return vaddw_u8(sum, vget_high_u8(src[4])); +} + inline uint32x4_t Sum3W_32(const uint16x4_t src[3]) { const uint32x4_t sum = vaddl_u16(src[0], src[1]); return vaddw_u16(sum, src[2]); @@ -678,13 +735,28 @@ inline uint32x4_t Sum5W_32(const uint16x4_t src[5]) { return vaddw_u16(sum0123, src[4]); } -inline uint16x8_t Sum3Horizontal(const uint8x8x2_t src) { +inline uint16x8_t Sum3Horizontal(const uint8x8_t src[2]) { uint8x8_t s[3]; Prepare3_8(src, s); return Sum3W_16(s); } -inline uint32x4x2_t Sum3WHorizontal(const uint16x8x2_t src) { +inline uint16x8_t Sum3Horizontal(const uint8x16_t src) { + uint8x8_t s[2]; + s[0] = vget_low_u8(src); + s[1] = vget_high_u8(src); + return Sum3Horizontal(s); +} + +template <int offset> +inline void Sum3Horizontal(const uint8x16_t src[2], uint16x8_t dst[2]) { + uint8x16_t s[3]; + Prepare3_8<offset>(src, s); + dst[0] = Sum3WLo16(s); + dst[1] = Sum3WHi16(s); +} + +inline uint32x4x2_t Sum3WHorizontal(const uint16x8_t 
src[2]) { uint16x4_t low[3], high[3]; uint32x4x2_t sum; Prepare3_16(src, low, high); @@ -693,7 +765,7 @@ inline uint32x4x2_t Sum3WHorizontal(const uint16x8x2_t src) { return sum; } -inline uint16x8_t Sum5Horizontal(const uint8x8x2_t src) { +inline uint16x8_t Sum5Horizontal(const uint8x8_t src[2]) { uint8x8_t s[5]; Prepare5_8(src, s); const uint16x8_t sum01 = vaddl_u8(s[0], s[1]); @@ -702,7 +774,23 @@ inline uint16x8_t Sum5Horizontal(const uint8x8x2_t src) { return vaddw_u8(sum0123, s[4]); } -inline uint32x4x2_t Sum5WHorizontal(const uint16x8x2_t src) { +inline uint16x8_t Sum5Horizontal(const uint8x16_t src) { + uint8x8_t s[2]; + s[0] = vget_low_u8(src); + s[1] = vget_high_u8(src); + return Sum5Horizontal(s); +} + +template <int offset> +inline void Sum5Horizontal(const uint8x16_t src[2], uint16x8_t* const dst0, + uint16x8_t* const dst1) { + uint8x16_t s[5]; + Prepare5_8<offset>(src, s); + *dst0 = Sum5WLo16(s); + *dst1 = Sum5WHi16(s); +} + +inline uint32x4x2_t Sum5WHorizontal(const uint16x8_t src[2]) { uint16x4_t low[5], high[5]; Prepare5_16(src, low, high); uint32x4x2_t sum; @@ -711,35 +799,68 @@ inline uint32x4x2_t Sum5WHorizontal(const uint16x8x2_t src) { return sum; } -void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3, - uint32x4_t* const row_sq5) { - const uint32x4_t sum04 = vaddl_u16(src[0], src[4]); - const uint32x4_t sum12 = vaddl_u16(src[1], src[2]); - *row_sq3 = vaddw_u16(sum12, src[3]); - *row_sq5 = vaddq_u32(sum04, *row_sq3); +template <int offset> +void SumHorizontal(const uint8x16_t src[2], uint16x8_t* const row3_0, + uint16x8_t* const row3_1, uint16x8_t* const row5_0, + uint16x8_t* const row5_1) { + uint8x16_t s[5]; + Prepare5_8<offset>(src, s); + const uint16x8_t sum04_lo = vaddl_u8(vget_low_u8(s[0]), vget_low_u8(s[4])); + const uint16x8_t sum04_hi = vaddl_u8(vget_high_u8(s[0]), vget_high_u8(s[4])); + *row3_0 = Sum3WLo16(s + 1); + *row3_1 = Sum3WHi16(s + 1); + *row5_0 = vaddq_u16(sum04_lo, *row3_0); + *row5_1 = vaddq_u16(sum04_hi, 
*row3_1); } -void SumHorizontal(const uint8x8x2_t src, const uint16x8x2_t sq, - uint16x8_t* const row3, uint16x8_t* const row5, - uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) { +void SumHorizontal(const uint8x8_t src[2], uint16x8_t* const row3, + uint16x8_t* const row5) { uint8x8_t s[5]; Prepare5_8(src, s); const uint16x8_t sum04 = vaddl_u8(s[0], s[4]); const uint16x8_t sum12 = vaddl_u8(s[1], s[2]); *row3 = vaddw_u8(sum12, s[3]); *row5 = vaddq_u16(sum04, *row3); +} + +void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3, + uint32x4_t* const row_sq5) { + const uint32x4_t sum04 = vaddl_u16(src[0], src[4]); + const uint32x4_t sum12 = vaddl_u16(src[1], src[2]); + *row_sq3 = vaddw_u16(sum12, src[3]); + *row_sq5 = vaddq_u32(sum04, *row_sq3); +} + +void SumHorizontal(const uint16x8_t sq[2], uint32x4x2_t* const row_sq3, + uint32x4x2_t* const row_sq5) { uint16x4_t low[5], high[5]; Prepare5_16(sq, low, high); SumHorizontal(low, &row_sq3->val[0], &row_sq5->val[0]); SumHorizontal(high, &row_sq3->val[1], &row_sq5->val[1]); } -inline uint16x8_t Sum343(const uint8x8x2_t src) { - uint8x8_t s[3]; - Prepare3_8(src, s); - const uint16x8_t sum = Sum3W_16(s); +void SumHorizontal(const uint8x8_t src[2], const uint16x8_t sq[2], + uint16x8_t* const row3, uint16x8_t* const row5, + uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) { + SumHorizontal(src, row3, row5); + SumHorizontal(sq, row_sq3, row_sq5); +} + +void SumHorizontal(const uint8x16_t src, const uint16x8_t sq[2], + uint16x8_t* const row3, uint16x8_t* const row5, + uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) { + uint8x8_t s[2]; + s[0] = vget_low_u8(src); + s[1] = vget_high_u8(src); + return SumHorizontal(s, sq, row3, row5, row_sq3, row_sq5); +} + +template <int offset> +inline uint16x8_t Sum343(const uint8x16_t ma3[2]) { + const uint16x8_t sum = (offset == 0) ? 
Sum3WLo16(ma3) : Sum3WHi16(ma3); const uint16x8_t sum3 = Sum3_16(sum, sum, sum); - return vaddw_u8(sum3, s[1]); + return vaddw_u8(sum3, + (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1])); } inline uint32x4_t Sum343W(const uint16x4_t src[3]) { @@ -748,7 +869,7 @@ inline uint32x4_t Sum343W(const uint16x4_t src[3]) { return vaddw_u16(sum3, src[1]); } -inline uint32x4x2_t Sum343W(const uint16x8x2_t src) { +inline uint32x4x2_t Sum343W(const uint16x8_t src[2]) { uint16x4_t low[3], high[3]; uint32x4x2_t d; Prepare3_16(src, low, high); @@ -757,13 +878,13 @@ inline uint32x4x2_t Sum343W(const uint16x8x2_t src) { return d; } -inline uint16x8_t Sum565(const uint8x8x2_t src) { - uint8x8_t s[3]; - Prepare3_8(src, s); - const uint16x8_t sum = Sum3W_16(s); +template <int offset> +inline uint16x8_t Sum565(const uint8x16_t ma5[2]) { + const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma5) : Sum3WHi16(ma5); const uint16x8_t sum4 = vshlq_n_u16(sum, 2); const uint16x8_t sum5 = vaddq_u16(sum4, sum); - return vaddw_u8(sum5, s[1]); + return vaddw_u8(sum5, + (offset == 0) ? vget_low_u8(ma5[1]) : vget_high_u8(ma5[1])); } inline uint32x4_t Sum565W(const uint16x4_t src[3]) { @@ -773,7 +894,7 @@ inline uint32x4_t Sum565W(const uint16x4_t src[3]) { return vaddw_u16(sum5, src[1]); } -inline uint32x4x2_t Sum565W(const uint16x8x2_t src) { +inline uint32x4x2_t Sum565W(const uint16x8_t src[2]) { uint16x4_t low[3], high[3]; uint32x4x2_t d; Prepare3_16(src, low, high); @@ -783,21 +904,21 @@ inline uint32x4x2_t Sum565W(const uint16x8x2_t src) { } inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, - const int height, const ptrdiff_t sum_stride, uint16_t* sum3, - uint16_t* sum5, uint32_t* square_sum3, - uint32_t* square_sum5) { - int y = height; + const ptrdiff_t sum_stride, uint16_t* sum3, uint16_t* sum5, + uint32_t* square_sum3, uint32_t* square_sum5) { + int y = 2; + // Don't change loop width to 16, which is even slower. 
do { - uint8x8x2_t s; - uint16x8x2_t sq; - s.val[0] = vld1_u8(src); - sq.val[0] = vmull_u8(s.val[0], s.val[0]); + uint8x8_t s[2]; + uint16x8_t sq[2]; + s[0] = vld1_u8(src); + sq[0] = vmull_u8(s[0], s[0]); ptrdiff_t x = 0; do { uint16x8_t row3, row5; uint32x4x2_t row_sq3, row_sq5; - s.val[1] = vld1_u8(src + x + 8); - sq.val[1] = vmull_u8(s.val[1], s.val[1]); + s[1] = vld1_u8(src + x + 8); + sq[1] = vmull_u8(s[1], s[1]); SumHorizontal(s, sq, &row3, &row5, &row_sq3, &row_sq5); vst1q_u16(sum3, row3); vst1q_u16(sum5, row5); @@ -805,8 +926,8 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, vst1q_u32(square_sum3 + 4, row_sq3.val[1]); vst1q_u32(square_sum5 + 0, row_sq5.val[0]); vst1q_u32(square_sum5 + 4, row_sq5.val[1]); - s.val[0] = s.val[1]; - sq.val[0] = sq.val[1]; + s[0] = s[1]; + sq[0] = sq[1]; sum3 += 8; sum5 += 8; square_sum3 += 8; @@ -819,21 +940,22 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, template <int size> inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, - const int height, const ptrdiff_t sum_stride, uint16_t* sums, + const ptrdiff_t sum_stride, uint16_t* sums, uint32_t* square_sums) { static_assert(size == 3 || size == 5, ""); - int y = height; + int y = 2; + // Don't change loop width to 16, which is even slower. 
do { - uint8x8x2_t s; - uint16x8x2_t sq; - s.val[0] = vld1_u8(src); - sq.val[0] = vmull_u8(s.val[0], s.val[0]); + uint8x8_t s[2]; + uint16x8_t sq[2]; + s[0] = vld1_u8(src); + sq[0] = vmull_u8(s[0], s[0]); ptrdiff_t x = 0; do { uint16x8_t row; uint32x4x2_t row_sq; - s.val[1] = vld1_u8(src + x + 8); - sq.val[1] = vmull_u8(s.val[1], s.val[1]); + s[1] = vld1_u8(src + x + 8); + sq[1] = vmull_u8(s[1], s[1]); if (size == 3) { row = Sum3Horizontal(s); row_sq = Sum3WHorizontal(sq); @@ -844,8 +966,8 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, vst1q_u16(sums, row); vst1q_u32(square_sums + 0, row_sq.val[0]); vst1q_u32(square_sums + 4, row_sq.val[1]); - s.val[0] = s.val[1]; - sq.val[0] = sq.val[1]; + s[0] = s[1]; + sq[0] = sq[1]; sums += 8; square_sums += 8; x += 8; @@ -871,10 +993,18 @@ inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq, return vmovn_u32(shifted); } -template <int n> +inline uint8x8_t AdjustValue(const uint8x8_t value, const uint8x8_t index, + const int threshold) { + const uint8x8_t thresholds = vdup_n_u8(threshold); + const uint8x8_t offset = vcgt_u8(index, thresholds); + // Adding 255 is equivalent to subtracting 1 for 8-bit data. + return vadd_u8(value, offset); +} + +template <int n, int offset> inline void CalculateIntermediate(const uint16x8_t sum, const uint32x4x2_t sum_sq, - const uint32_t scale, uint8x8_t* const ma, + const uint32_t scale, uint8x16_t* const ma, uint16x8_t* const b) { constexpr uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; @@ -882,19 +1012,39 @@ inline void CalculateIntermediate(const uint16x8_t sum, const uint16x4_t z1 = CalculateMa<n>(vget_high_u16(sum), sum_sq.val[1], scale); const uint16x8_t z01 = vcombine_u16(z0, z1); - // Using vqmovn_u16() needs an extra sign extension instruction. - const uint16x8_t z = vminq_u16(z01, vdupq_n_u16(255)); - // Using vgetq_lane_s16() can save the sign extension instruction. 
- const uint8_t lookup[8] = { - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 0)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 1)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 2)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 3)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 4)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 5)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 6)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 7)]}; - *ma = vld1_u8(lookup); + const uint8x8_t idx = vqmovn_u16(z01); + // Use table lookup to read elements whose indices are less than 48. + // Using one uint8x8x4_t vector and one uint8x8x2_t vector is faster than + // using two uint8x8x3_t vectors. + uint8x8x4_t table0; + uint8x8x2_t table1; + table0.val[0] = vld1_u8(kSgrMaLookup + 0 * 8); + table0.val[1] = vld1_u8(kSgrMaLookup + 1 * 8); + table0.val[2] = vld1_u8(kSgrMaLookup + 2 * 8); + table0.val[3] = vld1_u8(kSgrMaLookup + 3 * 8); + table1.val[0] = vld1_u8(kSgrMaLookup + 4 * 8); + table1.val[1] = vld1_u8(kSgrMaLookup + 5 * 8); + // All elements whose indices are out of range [0, 47] are set to 0. + uint8x8_t val = vtbl4_u8(table0, idx); // Range [0, 31]. + // Subtract 8 to shuffle the next index range. + const uint8x8_t index = vsub_u8(idx, vdup_n_u8(32)); + const uint8x8_t res = vtbl2_u8(table1, index); // Range [32, 47]. + // Use OR instruction to combine shuffle results together. + val = vorr_u8(val, res); + + // For elements whose indices are larger than 47, since they seldom change + // values with the increase of the index, we use comparison and arithmetic + // operations to calculate their values. + // Elements whose indices are larger than 47 (with value 0) are set to 5. + val = vmax_u8(val, vdup_n_u8(5)); + val = AdjustValue(val, idx, 55); // 55 is the last index which value is 5. + val = AdjustValue(val, idx, 72); // 72 is the last index which value is 4. 
+ val = AdjustValue(val, idx, 101); // 101 is the last index which value is 3. + val = AdjustValue(val, idx, 169); // 169 is the last index which value is 2. + val = AdjustValue(val, idx, 254); // 254 is the last index which value is 1. + *ma = (offset == 0) ? vcombine_u8(val, vget_high_u8(*ma)) + : vcombine_u8(vget_low_u8(*ma), val); + // b = ma * b * one_over_n // |ma| = [0, 255] // |sum| is a box sum with radius 1 or 2. @@ -906,7 +1056,8 @@ inline void CalculateIntermediate(const uint16x8_t sum, // |kSgrProjReciprocalBits| is 12. // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). - const uint16x8_t maq = vmovl_u8(*ma); + const uint16x8_t maq = + vmovl_u8((offset == 0) ? vget_low_u8(*ma) : vget_high_u8(*ma)); const uint32x4_t m0 = vmull_u16(vget_low_u16(maq), vget_low_u16(sum)); const uint32x4_t m1 = vmull_u16(vget_high_u16(maq), vget_high_u16(sum)); const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n); @@ -916,37 +1067,39 @@ inline void CalculateIntermediate(const uint16x8_t sum, *b = vcombine_u16(b_lo, b_hi); } +template <int offset> inline void CalculateIntermediate5(const uint16x8_t s5[5], const uint32x4x2_t sq5[5], - const uint32_t scale, uint8x8_t* const ma, + const uint32_t scale, uint8x16_t* const ma, uint16x8_t* const b) { const uint16x8_t sum = Sum5_16(s5); const uint32x4x2_t sum_sq = Sum5_32(sq5); - CalculateIntermediate<25>(sum, sum_sq, scale, ma, b); + CalculateIntermediate<25, offset>(sum, sum_sq, scale, ma, b); } +template <int offset> inline void CalculateIntermediate3(const uint16x8_t s3[3], const uint32x4x2_t sq3[3], - const uint32_t scale, uint8x8_t* const ma, + const uint32_t scale, uint8x16_t* const ma, uint16x8_t* const b) { const uint16x8_t sum = Sum3_16(s3); const uint32x4x2_t sum_sq = Sum3_32(sq3); - CalculateIntermediate<9>(sum, sum_sq, scale, ma, b); + CalculateIntermediate<9, offset>(sum, sum_sq, scale, ma, b); } -inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t 
b3, +template <int offset> +inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2], const ptrdiff_t x, uint16x8_t* const sum_ma343, uint16x8_t* const sum_ma444, uint32x4x2_t* const sum_b343, uint32x4x2_t* const sum_b444, uint16_t* const ma343, uint16_t* const ma444, uint32_t* const b343, uint32_t* const b444) { - uint8x8_t s[3]; - Prepare3_8(ma3, s); - const uint16x8_t sum_ma111 = Sum3W_16(s); + const uint16x8_t sum_ma111 = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3); *sum_ma444 = vshlq_n_u16(sum_ma111, 2); const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111); - *sum_ma343 = vaddw_u8(sum333, s[1]); + *sum_ma343 = vaddw_u8( + sum333, (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1])); uint16x4_t low[3], high[3]; uint32x4x2_t sum_b111; Prepare3_16(b3, low, high); @@ -966,93 +1119,211 @@ inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3, vst1q_u32(b444 + x + 4, sum_b444->val[1]); } -inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3, +template <int offset> +inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2], const ptrdiff_t x, uint16x8_t* const sum_ma343, uint32x4x2_t* const sum_b343, uint16_t* const ma343, uint16_t* const ma444, uint32_t* const b343, uint32_t* const b444) { uint16x8_t sum_ma444; uint32x4x2_t sum_b444; - Store343_444(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444, ma343, - ma444, b343, b444); + Store343_444<offset>(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444, + ma343, ma444, b343, b444); } -inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3, +template <int offset> +inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2], const ptrdiff_t x, uint16_t* const ma343, uint16_t* const ma444, uint32_t* const b343, uint32_t* const b444) { uint16x8_t sum_ma343; uint32x4x2_t sum_b343; - Store343_444(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343, b444); + Store343_444<offset>(ma3, b3, x, &sum_ma343, 
&sum_b343, ma343, ma444, b343, + b444); } -LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5( - const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x, - const uint32_t scale, uint16_t* const sum5[5], - uint32_t* const square_sum5[5], uint8x8x2_t s[2], uint16x8x2_t sq[2], - uint8x8_t* const ma, uint16x8_t* const b) { +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo( + const uint8_t* const src0, const uint8_t* const src1, const uint32_t scale, + uint8x16_t s[2][2], uint16_t* const sum5[5], uint32_t* const square_sum5[5], + uint16x8_t sq[2][4], uint8x16_t* const ma, uint16x8_t* const b) { uint16x8_t s5[5]; uint32x4x2_t sq5[5]; - s[0].val[1] = vld1_u8(src0 + x + 8); - s[1].val[1] = vld1_u8(src1 + x + 8); - sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]); - sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]); - s5[3] = Sum5Horizontal(s[0]); - s5[4] = Sum5Horizontal(s[1]); + s[0][0] = vld1q_u8(src0); + s[1][0] = vld1q_u8(src1); + sq[0][0] = vmull_u8(vget_low_u8(s[0][0]), vget_low_u8(s[0][0])); + sq[1][0] = vmull_u8(vget_low_u8(s[1][0]), vget_low_u8(s[1][0])); + sq[0][1] = vmull_u8(vget_high_u8(s[0][0]), vget_high_u8(s[0][0])); + sq[1][1] = vmull_u8(vget_high_u8(s[1][0]), vget_high_u8(s[1][0])); + s5[3] = Sum5Horizontal(s[0][0]); + s5[4] = Sum5Horizontal(s[1][0]); sq5[3] = Sum5WHorizontal(sq[0]); sq5[4] = Sum5WHorizontal(sq[1]); - vst1q_u16(sum5[3] + x, s5[3]); - vst1q_u16(sum5[4] + x, s5[4]); + vst1q_u16(sum5[3], s5[3]); + vst1q_u16(sum5[4], s5[4]); + vst1q_u32(square_sum5[3] + 0, sq5[3].val[0]); + vst1q_u32(square_sum5[3] + 4, sq5[3].val[1]); + vst1q_u32(square_sum5[4] + 0, sq5[4].val[0]); + vst1q_u32(square_sum5[4] + 4, sq5[4].val[1]); + s5[0] = vld1q_u16(sum5[0]); + s5[1] = vld1q_u16(sum5[1]); + s5[2] = vld1q_u16(sum5[2]); + sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4); + sq5[2].val[0] = 
vld1q_u32(square_sum5[2] + 0); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4); + CalculateIntermediate5<0>(s5, sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5( + const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x, + const uint32_t scale, uint8x16_t s[2][2], uint16_t* const sum5[5], + uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma[2], + uint16x8_t b[2]) { + uint16x8_t s5[2][5]; + uint32x4x2_t sq5[5]; + s[0][1] = vld1q_u8(src0 + x + 8); + s[1][1] = vld1q_u8(src1 + x + 8); + sq[0][2] = vmull_u8(vget_low_u8(s[0][1]), vget_low_u8(s[0][1])); + sq[1][2] = vmull_u8(vget_low_u8(s[1][1]), vget_low_u8(s[1][1])); + Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]); + Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]); + sq5[3] = Sum5WHorizontal(sq[0] + 1); + sq5[4] = Sum5WHorizontal(sq[1] + 1); + vst1q_u16(sum5[3] + x, s5[0][3]); + vst1q_u16(sum5[4] + x, s5[0][4]); vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]); vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]); vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]); vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]); - s5[0] = vld1q_u16(sum5[0] + x); - s5[1] = vld1q_u16(sum5[1] + x); - s5[2] = vld1q_u16(sum5[2] + x); + s5[0][0] = vld1q_u16(sum5[0] + x); + s5[0][1] = vld1q_u16(sum5[1] + x); + s5[0][2] = vld1q_u16(sum5[2] + x); sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0); sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4); sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0); sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4); sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0); sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4); - CalculateIntermediate5(s5, sq5, scale, ma, b); + CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]); + + sq[0][3] = vmull_u8(vget_high_u8(s[0][1]), vget_high_u8(s[0][1])); + sq[1][3] = vmull_u8(vget_high_u8(s[1][1]), vget_high_u8(s[1][1])); + sq5[3] = Sum5WHorizontal(sq[0] + 2); + sq5[4] = Sum5WHorizontal(sq[1] + 2); + vst1q_u16(sum5[3] + x + 
8, s5[1][3]); + vst1q_u16(sum5[4] + x + 8, s5[1][4]); + vst1q_u32(square_sum5[3] + x + 8, sq5[3].val[0]); + vst1q_u32(square_sum5[3] + x + 12, sq5[3].val[1]); + vst1q_u32(square_sum5[4] + x + 8, sq5[4].val[0]); + vst1q_u32(square_sum5[4] + x + 12, sq5[4].val[1]); + s5[1][0] = vld1q_u16(sum5[0] + x + 8); + s5[1][1] = vld1q_u16(sum5[1] + x + 8); + s5[1][2] = vld1q_u16(sum5[2] + x + 8); + sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12); + CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo( + const uint8_t* const src, const uint32_t scale, uint8x16_t* const s, + const uint16_t* const sum5[5], const uint32_t* const square_sum5[5], + uint16x8_t sq[2], uint8x16_t* const ma, uint16x8_t* const b) { + uint16x8_t s5[5]; + uint32x4x2_t sq5[5]; + *s = vld1q_u8(src); + sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s)); + sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s)); + s5[3] = s5[4] = Sum5Horizontal(*s); + sq5[3] = sq5[4] = Sum5WHorizontal(sq); + s5[0] = vld1q_u16(sum5[0]); + s5[1] = vld1q_u16(sum5[1]); + s5[2] = vld1q_u16(sum5[2]); + sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4); + CalculateIntermediate5<0>(s5, sq5, scale, ma, b); } LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow( const uint8_t* const src, const ptrdiff_t x, const uint32_t scale, - const uint16_t* const sum5[5], const uint32_t* const square_sum5[5], - uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma, - 
uint16x8_t* const b) { - uint16x8_t s5[5]; + uint8x16_t s[2], const uint16_t* const sum5[5], + const uint32_t* const square_sum5[5], uint16x8_t sq[3], uint8x16_t ma[2], + uint16x8_t b[2]) { + uint16x8_t s5[2][5]; uint32x4x2_t sq5[5]; - s->val[1] = vld1_u8(src + x + 8); - sq->val[1] = vmull_u8(s->val[1], s->val[1]); - s5[3] = s5[4] = Sum5Horizontal(*s); - sq5[3] = sq5[4] = Sum5WHorizontal(*sq); - s5[0] = vld1q_u16(sum5[0] + x); - s5[1] = vld1q_u16(sum5[1] + x); - s5[2] = vld1q_u16(sum5[2] + x); + s[1] = vld1q_u8(src + x + 8); + sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1])); + Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]); + sq5[3] = sq5[4] = Sum5WHorizontal(sq); + s5[0][0] = vld1q_u16(sum5[0] + x); + s5[0][1] = vld1q_u16(sum5[1] + x); + s5[0][2] = vld1q_u16(sum5[2] + x); + s5[0][4] = s5[0][3]; sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0); sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4); sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0); sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4); sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0); sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4); - CalculateIntermediate5(s5, sq5, scale, ma, b); + CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]); + + sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1])); + sq5[3] = sq5[4] = Sum5WHorizontal(sq + 1); + s5[1][0] = vld1q_u16(sum5[0] + x + 8); + s5[1][1] = vld1q_u16(sum5[1] + x + 8); + s5[1][2] = vld1q_u16(sum5[2] + x + 8); + s5[1][4] = s5[1][3]; + sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12); + CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo( + const uint8_t* const src, const uint32_t scale, uint8x16_t* const s, + 
uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16x8_t sq[2], + uint8x16_t* const ma, uint16x8_t* const b) { + uint16x8_t s3[3]; + uint32x4x2_t sq3[3]; + *s = vld1q_u8(src); + sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s)); + sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s)); + s3[2] = Sum3Horizontal(*s); + sq3[2] = Sum3WHorizontal(sq); + vst1q_u16(sum3[2], s3[2]); + vst1q_u32(square_sum3[2] + 0, sq3[2].val[0]); + vst1q_u32(square_sum3[2] + 4, sq3[2].val[1]); + s3[0] = vld1q_u16(sum3[0]); + s3[1] = vld1q_u16(sum3[1]); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4); + CalculateIntermediate3<0>(s3, sq3, scale, ma, b); } LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3( const uint8_t* const src, const ptrdiff_t x, const uint32_t scale, - uint16_t* const sum3[3], uint32_t* const square_sum3[3], - uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma, - uint16x8_t* const b) { - uint16x8_t s3[3]; + uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint8x16_t s[2], + uint16x8_t sq[3], uint8x16_t ma[2], uint16x8_t b[2]) { + uint16x8_t s3[4]; uint32x4x2_t sq3[3]; - s->val[1] = vld1_u8(src + x + 8); - sq->val[1] = vmull_u8(s->val[1], s->val[1]); - s3[2] = Sum3Horizontal(*s); - sq3[2] = Sum3WHorizontal(*sq); + s[1] = vld1q_u8(src + x + 8); + sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1])); + Sum3Horizontal<8>(s, s3 + 2); + sq3[2] = Sum3WHorizontal(sq); vst1q_u16(sum3[2] + x, s3[2]); vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]); vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]); @@ -1062,71 +1333,204 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3( sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4); sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0); sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4); - CalculateIntermediate3(s3, sq3, scale, ma, b); + CalculateIntermediate3<8>(s3, 
sq3, scale, &ma[0], &b[0]); + + sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1])); + sq3[2] = Sum3WHorizontal(sq + 1); + vst1q_u16(sum3[2] + x + 8, s3[3]); + vst1q_u32(square_sum3[2] + x + 8, sq3[2].val[0]); + vst1q_u32(square_sum3[2] + x + 12, sq3[2].val[1]); + s3[1] = vld1q_u16(sum3[0] + x + 8); + s3[2] = vld1q_u16(sum3[1] + x + 8); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12); + CalculateIntermediate3<0>(s3 + 1, sq3, scale, &ma[1], &b[1]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo( + const uint8_t* const src0, const uint8_t* const src1, + const uint16_t scales[2], uint8x16_t s[2][2], uint16_t* const sum3[4], + uint16_t* const sum5[5], uint32_t* const square_sum3[4], + uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2], + uint16x8_t b3[2][3], uint8x16_t* const ma5, uint16x8_t* const b5) { + uint16x8_t s3[4], s5[5]; + uint32x4x2_t sq3[4], sq5[5]; + s[0][0] = vld1q_u8(src0); + s[1][0] = vld1q_u8(src1); + sq[0][0] = vmull_u8(vget_low_u8(s[0][0]), vget_low_u8(s[0][0])); + sq[1][0] = vmull_u8(vget_low_u8(s[1][0]), vget_low_u8(s[1][0])); + sq[0][1] = vmull_u8(vget_high_u8(s[0][0]), vget_high_u8(s[0][0])); + sq[1][1] = vmull_u8(vget_high_u8(s[1][0]), vget_high_u8(s[1][0])); + SumHorizontal(s[0][0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]); + SumHorizontal(s[1][0], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]); + vst1q_u16(sum3[2], s3[2]); + vst1q_u16(sum3[3], s3[3]); + vst1q_u32(square_sum3[2] + 0, sq3[2].val[0]); + vst1q_u32(square_sum3[2] + 4, sq3[2].val[1]); + vst1q_u32(square_sum3[3] + 0, sq3[3].val[0]); + vst1q_u32(square_sum3[3] + 4, sq3[3].val[1]); + vst1q_u16(sum5[3], s5[3]); + vst1q_u16(sum5[4], s5[4]); + vst1q_u32(square_sum5[3] + 0, sq5[3].val[0]); + vst1q_u32(square_sum5[3] + 4, sq5[3].val[1]); + vst1q_u32(square_sum5[4] + 0, sq5[4].val[0]); + 
vst1q_u32(square_sum5[4] + 4, sq5[4].val[1]); + s3[0] = vld1q_u16(sum3[0]); + s3[1] = vld1q_u16(sum3[1]); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4); + s5[0] = vld1q_u16(sum5[0]); + s5[1] = vld1q_u16(sum5[1]); + s5[2] = vld1q_u16(sum5[2]); + sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4); + CalculateIntermediate3<0>(s3, sq3, scales[1], ma3[0], b3[0]); + CalculateIntermediate3<0>(s3 + 1, sq3 + 1, scales[1], ma3[1], b3[1]); + CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5); } LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x, - const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], - uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - uint8x8x2_t s[2], uint16x8x2_t sq[2], uint8x8_t* const ma3_0, - uint8x8_t* const ma3_1, uint16x8_t* const b3_0, uint16x8_t* const b3_1, - uint8x8_t* const ma5, uint16x8_t* const b5) { - uint16x8_t s3[4], s5[5]; + const uint16_t scales[2], uint8x16_t s[2][2], uint16_t* const sum3[4], + uint16_t* const sum5[5], uint32_t* const square_sum3[4], + uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2], + uint16x8_t b3[2][3], uint8x16_t ma5[2], uint16x8_t b5[2]) { + uint16x8_t s3[2][4], s5[2][5]; uint32x4x2_t sq3[4], sq5[5]; - s[0].val[1] = vld1_u8(src0 + x + 8); - s[1].val[1] = vld1_u8(src1 + x + 8); - sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]); - sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]); - SumHorizontal(s[0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]); - SumHorizontal(s[1], sq[1], &s3[3], &s5[4], &sq3[3], 
&sq5[4]); - vst1q_u16(sum3[2] + x, s3[2]); - vst1q_u16(sum3[3] + x, s3[3]); + s[0][1] = vld1q_u8(src0 + x + 8); + s[1][1] = vld1q_u8(src1 + x + 8); + sq[0][2] = vmull_u8(vget_low_u8(s[0][1]), vget_low_u8(s[0][1])); + sq[1][2] = vmull_u8(vget_low_u8(s[1][1]), vget_low_u8(s[1][1])); + SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]); + SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]); + SumHorizontal(sq[0] + 1, &sq3[2], &sq5[3]); + SumHorizontal(sq[1] + 1, &sq3[3], &sq5[4]); + vst1q_u16(sum3[2] + x, s3[0][2]); + vst1q_u16(sum3[3] + x, s3[0][3]); vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]); vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]); vst1q_u32(square_sum3[3] + x + 0, sq3[3].val[0]); vst1q_u32(square_sum3[3] + x + 4, sq3[3].val[1]); - vst1q_u16(sum5[3] + x, s5[3]); - vst1q_u16(sum5[4] + x, s5[4]); + vst1q_u16(sum5[3] + x, s5[0][3]); + vst1q_u16(sum5[4] + x, s5[0][4]); vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]); vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]); vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]); vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]); - s3[0] = vld1q_u16(sum3[0] + x); - s3[1] = vld1q_u16(sum3[1] + x); + s3[0][0] = vld1q_u16(sum3[0] + x); + s3[0][1] = vld1q_u16(sum3[1] + x); sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0); sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4); sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0); sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4); - s5[0] = vld1q_u16(sum5[0] + x); - s5[1] = vld1q_u16(sum5[1] + x); - s5[2] = vld1q_u16(sum5[2] + x); + s5[0][0] = vld1q_u16(sum5[0] + x); + s5[0][1] = vld1q_u16(sum5[1] + x); + s5[0][2] = vld1q_u16(sum5[2] + x); sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0); sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4); sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0); sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4); sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0); sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4); - 
CalculateIntermediate3(s3, sq3, scales[1], ma3_0, b3_0); - CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], ma3_1, b3_1); - CalculateIntermediate5(s5, sq5, scales[0], ma5, b5); + CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0][0], &b3[0][1]); + CalculateIntermediate3<8>(s3[0] + 1, sq3 + 1, scales[1], &ma3[1][0], + &b3[1][1]); + CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]); + + sq[0][3] = vmull_u8(vget_high_u8(s[0][1]), vget_high_u8(s[0][1])); + sq[1][3] = vmull_u8(vget_high_u8(s[1][1]), vget_high_u8(s[1][1])); + SumHorizontal(sq[0] + 2, &sq3[2], &sq5[3]); + SumHorizontal(sq[1] + 2, &sq3[3], &sq5[4]); + vst1q_u16(sum3[2] + x + 8, s3[1][2]); + vst1q_u16(sum3[3] + x + 8, s3[1][3]); + vst1q_u32(square_sum3[2] + x + 8, sq3[2].val[0]); + vst1q_u32(square_sum3[2] + x + 12, sq3[2].val[1]); + vst1q_u32(square_sum3[3] + x + 8, sq3[3].val[0]); + vst1q_u32(square_sum3[3] + x + 12, sq3[3].val[1]); + vst1q_u16(sum5[3] + x + 8, s5[1][3]); + vst1q_u16(sum5[4] + x + 8, s5[1][4]); + vst1q_u32(square_sum5[3] + x + 8, sq5[3].val[0]); + vst1q_u32(square_sum5[3] + x + 12, sq5[3].val[1]); + vst1q_u32(square_sum5[4] + x + 8, sq5[4].val[0]); + vst1q_u32(square_sum5[4] + x + 12, sq5[4].val[1]); + s3[1][0] = vld1q_u16(sum3[0] + x + 8); + s3[1][1] = vld1q_u16(sum3[1] + x + 8); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12); + s5[1][0] = vld1q_u16(sum5[0] + x + 8); + s5[1][1] = vld1q_u16(sum5[1] + x + 8); + s5[1][2] = vld1q_u16(sum5[2] + x + 8); + sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12); + CalculateIntermediate3<0>(s3[1], sq3, 
scales[1], &ma3[0][1], &b3[0][2]); + CalculateIntermediate3<0>(s3[1] + 1, sq3 + 1, scales[1], &ma3[1][1], + &b3[1][2]); + CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo( + const uint8_t* const src, const uint16_t scales[2], + const uint16_t* const sum3[4], const uint16_t* const sum5[5], + const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5], + uint8x16_t* const s, uint16x8_t sq[2], uint8x16_t* const ma3, + uint8x16_t* const ma5, uint16x8_t* const b3, uint16x8_t* const b5) { + uint16x8_t s3[3], s5[5]; + uint32x4x2_t sq3[3], sq5[5]; + *s = vld1q_u8(src); + sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s)); + sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s)); + SumHorizontal(*s, sq, &s3[2], &s5[3], &sq3[2], &sq5[3]); + s5[0] = vld1q_u16(sum5[0]); + s5[1] = vld1q_u16(sum5[1]); + s5[2] = vld1q_u16(sum5[2]); + s5[4] = s5[3]; + sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4); + sq5[4] = sq5[3]; + CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5); + s3[0] = vld1q_u16(sum3[0]); + s3[1] = vld1q_u16(sum3[1]); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4); + CalculateIntermediate3<0>(s3, sq3, scales[1], ma3, b3); } LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( const uint8_t* const src, const ptrdiff_t x, const uint16_t scales[2], const uint16_t* const sum3[4], const uint16_t* const sum5[5], const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5], - uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma3, - uint8x8_t* const ma5, uint16x8_t* 
const b3, uint16x8_t* const b5) { - uint16x8_t s3[3], s5[5]; + uint8x16_t s[2], uint16x8_t sq[3], uint8x16_t ma3[2], uint8x16_t ma5[2], + uint16x8_t b3[2], uint16x8_t b5[2]) { + uint16x8_t s3[2][3], s5[2][5]; uint32x4x2_t sq3[3], sq5[5]; - s->val[1] = vld1_u8(src + x + 8); - sq->val[1] = vmull_u8(s->val[1], s->val[1]); - SumHorizontal(*s, *sq, &s3[2], &s5[3], &sq3[2], &sq5[3]); - s5[0] = vld1q_u16(sum5[0] + x); - s5[1] = vld1q_u16(sum5[1] + x); - s5[2] = vld1q_u16(sum5[2] + x); - s5[4] = s5[3]; + s[1] = vld1q_u8(src + x + 8); + sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1])); + SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]); + SumHorizontal(sq, &sq3[2], &sq5[3]); + s5[0][0] = vld1q_u16(sum5[0] + x); + s5[0][1] = vld1q_u16(sum5[1] + x); + s5[0][2] = vld1q_u16(sum5[2] + x); + s5[0][4] = s5[0][3]; sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0); sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4); sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0); @@ -1134,14 +1538,36 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0); sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4); sq5[4] = sq5[3]; - CalculateIntermediate5(s5, sq5, scales[0], ma5, b5); - s3[0] = vld1q_u16(sum3[0] + x); - s3[1] = vld1q_u16(sum3[1] + x); + CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]); + s3[0][0] = vld1q_u16(sum3[0] + x); + s3[0][1] = vld1q_u16(sum3[1] + x); sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0); sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4); sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0); sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4); - CalculateIntermediate3(s3, sq3, scales[1], ma3, b3); + CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0], &b3[0]); + + sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1])); + SumHorizontal(sq + 1, &sq3[2], &sq5[3]); + s5[1][0] = vld1q_u16(sum5[0] + x + 8); + s5[1][1] = vld1q_u16(sum5[1] + x + 8); + s5[1][2] = vld1q_u16(sum5[2] + x + 
8); + s5[1][4] = s5[1][3]; + sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12); + sq5[4] = sq5[3]; + CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]); + s3[1][0] = vld1q_u16(sum3[0] + x + 8); + s3[1][1] = vld1q_u16(sum3[1] + x + 8); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12); + CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[1], &b3[1]); } inline void BoxSumFilterPreProcess5(const uint8_t* const src0, @@ -1150,33 +1576,39 @@ inline void BoxSumFilterPreProcess5(const uint8_t* const src0, uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565, uint32_t* b565) { - uint8x8x2_t s[2], mas; - uint16x8x2_t sq[2], bs; - s[0].val[0] = vld1_u8(src0); - s[1].val[0] = vld1_u8(src1); - sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]); - sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]); - BoxFilterPreProcess5(src0, src1, 0, scale, sum5, square_sum5, s, sq, - &mas.val[0], &bs.val[0]); + uint8x16_t s[2][2], mas[2]; + uint16x8_t sq[2][4], bs[3]; + BoxFilterPreProcess5Lo(src0, src1, scale, s, sum5, square_sum5, sq, &mas[0], + &bs[0]); int x = 0; do { - s[0].val[0] = s[0].val[1]; - s[1].val[0] = s[1].val[1]; - sq[0].val[0] = sq[0].val[1]; - sq[1].val[0] = sq[1].val[1]; - BoxFilterPreProcess5(src0, src1, x + 8, scale, sum5, square_sum5, s, sq, - &mas.val[1], &bs.val[1]); - const uint16x8_t ma = Sum565(mas); - const uint32x4x2_t b = Sum565W(bs); - vst1q_u16(ma565, ma); - vst1q_u32(b565 + 0, b.val[0]); - vst1q_u32(b565 + 4, b.val[1]); - mas.val[0] = mas.val[1]; - bs.val[0] = bs.val[1]; - ma565 += 8; - b565 += 8; - x 
+= 8; + uint16x8_t ma[2]; + uint8x16_t masx[3]; + uint32x4x2_t b[2]; + BoxFilterPreProcess5(src0, src1, x + 8, scale, s, sum5, square_sum5, sq, + mas, bs + 1); + Prepare3_8<0>(mas, masx); + ma[0] = Sum565<0>(masx); + b[0] = Sum565W(bs); + vst1q_u16(ma565, ma[0]); + vst1q_u32(b565 + 0, b[0].val[0]); + vst1q_u32(b565 + 4, b[0].val[1]); + + ma[1] = Sum565<8>(masx); + b[1] = Sum565W(bs + 1); + vst1q_u16(ma565 + 8, ma[1]); + vst1q_u32(b565 + 8, b[1].val[0]); + vst1q_u32(b565 + 12, b[1].val[1]); + s[0][0] = s[0][1]; + s[1][0] = s[1][1]; + sq[0][1] = sq[0][3]; + sq[1][1] = sq[1][3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + ma565 += 16; + b565 += 16; + x += 16; } while (x < width); } @@ -1185,35 +1617,44 @@ LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3( const uint8_t* const src, const int width, const uint32_t scale, uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16_t* ma343, uint16_t* ma444, uint32_t* b343, uint32_t* b444) { - uint8x8x2_t s, mas; - uint16x8x2_t sq, bs; - s.val[0] = vld1_u8(src); - sq.val[0] = vmull_u8(s.val[0], s.val[0]); - BoxFilterPreProcess3(src, 0, scale, sum3, square_sum3, &s, &sq, &mas.val[0], - &bs.val[0]); + uint8x16_t s[2], mas[2]; + uint16x8_t sq[4], bs[3]; + BoxFilterPreProcess3Lo(src, scale, &s[0], sum3, square_sum3, sq, &mas[0], + &bs[0]); int x = 0; do { - s.val[0] = s.val[1]; - sq.val[0] = sq.val[1]; - BoxFilterPreProcess3(src, x + 8, scale, sum3, square_sum3, &s, &sq, - &mas.val[1], &bs.val[1]); + uint8x16_t ma3x[3]; + BoxFilterPreProcess3(src, x + 8, scale, sum3, square_sum3, s, sq + 1, mas, + bs + 1); + Prepare3_8<0>(mas, ma3x); if (calculate444) { - Store343_444(mas, bs, 0, ma343, ma444, b343, b444); - ma444 += 8; - b444 += 8; + Store343_444<0>(ma3x, bs + 0, 0, ma343, ma444, b343, b444); + Store343_444<8>(ma3x, bs + 1, 0, ma343 + 8, ma444 + 8, b343 + 8, + b444 + 8); + ma444 += 16; + b444 += 16; } else { - const uint16x8_t ma = Sum343(mas); - const uint32x4x2_t b = Sum343W(bs); - vst1q_u16(ma343, ma); - vst1q_u32(b343 + 
0, b.val[0]); - vst1q_u32(b343 + 4, b.val[1]); + uint16x8_t ma[2]; + uint32x4x2_t b[2]; + ma[0] = Sum343<0>(ma3x); + b[0] = Sum343W(bs); + vst1q_u16(ma343, ma[0]); + vst1q_u32(b343 + 0, b[0].val[0]); + vst1q_u32(b343 + 4, b[0].val[1]); + ma[1] = Sum343<8>(ma3x); + b[1] = Sum343W(bs + 1); + vst1q_u16(ma343 + 8, ma[1]); + vst1q_u32(b343 + 8, b[1].val[0]); + vst1q_u32(b343 + 12, b[1].val[1]); } - mas.val[0] = mas.val[1]; - bs.val[0] = bs.val[1]; - ma343 += 8; - b343 += 8; - x += 8; + s[0] = s[1]; + sq[1] = sq[3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + ma343 += 16; + b343 += 16; + x += 16; } while (x < width); } @@ -1221,48 +1662,58 @@ inline void BoxSumFilterPreProcess( const uint8_t* const src0, const uint8_t* const src1, const int width, const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - uint16_t* const ma343[4], uint16_t* const ma444[2], uint16_t* ma565, - uint32_t* const b343[4], uint32_t* const b444[2], uint32_t* b565) { - uint8x8x2_t s[2]; - uint8x8x2_t ma3[2], ma5; - uint16x8x2_t sq[2], b3[2], b5; - s[0].val[0] = vld1_u8(src0); - s[1].val[0] = vld1_u8(src1); - sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]); - sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]); - BoxFilterPreProcess(src0, src1, 0, scales, sum3, sum5, square_sum3, - square_sum5, s, sq, &ma3[0].val[0], &ma3[1].val[0], - &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]); + uint16_t* const ma343[4], uint16_t* const ma444, uint16_t* ma565, + uint32_t* const b343[4], uint32_t* const b444, uint32_t* b565) { + uint8x16_t s[2][2], ma3[2][2], ma5[2]; + uint16x8_t sq[2][4], b3[2][3], b5[3]; + BoxFilterPreProcessLo(src0, src1, scales, s, sum3, sum5, square_sum3, + square_sum5, sq, ma3, b3, &ma5[0], &b5[0]); int x = 0; do { - s[0].val[0] = s[0].val[1]; - s[1].val[0] = s[1].val[1]; - sq[0].val[0] = sq[0].val[1]; - sq[1].val[0] = sq[1].val[1]; - BoxFilterPreProcess(src0, src1, x + 8, scales, sum3, sum5, 
square_sum3, - square_sum5, s, sq, &ma3[0].val[1], &ma3[1].val[1], - &b3[0].val[1], &b3[1].val[1], &ma5.val[1], &b5.val[1]); - uint16x8_t ma = Sum343(ma3[0]); - uint32x4x2_t b = Sum343W(b3[0]); - vst1q_u16(ma343[0] + x, ma); - vst1q_u32(b343[0] + x, b.val[0]); - vst1q_u32(b343[0] + x + 4, b.val[1]); - Store343_444(ma3[1], b3[1], x, ma343[1], ma444[0], b343[1], b444[0]); - ma = Sum565(ma5); - b = Sum565W(b5); - vst1q_u16(ma565, ma); - vst1q_u32(b565 + 0, b.val[0]); - vst1q_u32(b565 + 4, b.val[1]); - ma3[0].val[0] = ma3[0].val[1]; - ma3[1].val[0] = ma3[1].val[1]; - b3[0].val[0] = b3[0].val[1]; - b3[1].val[0] = b3[1].val[1]; - ma5.val[0] = ma5.val[1]; - b5.val[0] = b5.val[1]; - ma565 += 8; - b565 += 8; - x += 8; + uint16x8_t ma[2]; + uint8x16_t ma3x[3], ma5x[3]; + uint32x4x2_t b[2]; + BoxFilterPreProcess(src0, src1, x + 8, scales, s, sum3, sum5, square_sum3, + square_sum5, sq, ma3, b3, ma5, b5 + 1); + Prepare3_8<0>(ma3[0], ma3x); + ma[0] = Sum343<0>(ma3x); + ma[1] = Sum343<8>(ma3x); + b[0] = Sum343W(b3[0] + 0); + b[1] = Sum343W(b3[0] + 1); + vst1q_u16(ma343[0] + x, ma[0]); + vst1q_u16(ma343[0] + x + 8, ma[1]); + vst1q_u32(b343[0] + x, b[0].val[0]); + vst1q_u32(b343[0] + x + 4, b[0].val[1]); + vst1q_u32(b343[0] + x + 8, b[1].val[0]); + vst1q_u32(b343[0] + x + 12, b[1].val[1]); + Prepare3_8<0>(ma3[1], ma3x); + Store343_444<0>(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444); + Store343_444<8>(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444); + Prepare3_8<0>(ma5, ma5x); + ma[0] = Sum565<0>(ma5x); + ma[1] = Sum565<8>(ma5x); + b[0] = Sum565W(b5); + b[1] = Sum565W(b5 + 1); + vst1q_u16(ma565, ma[0]); + vst1q_u16(ma565 + 8, ma[1]); + vst1q_u32(b565 + 0, b[0].val[0]); + vst1q_u32(b565 + 4, b[0].val[1]); + vst1q_u32(b565 + 8, b[1].val[0]); + vst1q_u32(b565 + 12, b[1].val[1]); + s[0][0] = s[0][1]; + s[1][0] = s[1][1]; + sq[0][1] = sq[0][3]; + sq[1][1] = sq[1][3]; + ma3[0][0] = ma3[0][1]; + ma3[1][0] = ma3[1][1]; + b3[0][0] = b3[0][2]; + b3[1][0] = b3[1][2]; + ma5[0] = 
ma5[1]; + b5[0] = b5[2]; + ma565 += 16; + b565 += 16; + x += 16; } while (x < width); } @@ -1310,37 +1761,36 @@ inline int16x8_t CalculateFilteredOutputPass2(const uint8x8_t s, return CalculateFilteredOutput<5>(s, ma_sum, b_sum); } -inline void SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2], - uint8_t* const dst) { +inline uint8x8_t SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2]) { const int16x4_t v_lo = vrshrn_n_s32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits); const int16x4_t v_hi = vrshrn_n_s32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits); const int16x8_t vv = vcombine_s16(v_lo, v_hi); - const int16x8_t s = ZeroExtend(src); - const int16x8_t d = vaddq_s16(s, vv); - vst1_u8(dst, vqmovun_s16(d)); + const int16x8_t d = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vv), src)); + return vqmovun_s16(d); } -inline void SelfGuidedDoubleMultiplier(const uint8x8_t src, - const int16x8_t filter[2], const int w0, - const int w2, uint8_t* const dst) { +inline uint8x8_t SelfGuidedDoubleMultiplier(const uint8x8_t src, + const int16x8_t filter[2], + const int w0, const int w2) { int32x4_t v[2]; v[0] = vmull_n_s16(vget_low_s16(filter[0]), w0); v[1] = vmull_n_s16(vget_high_s16(filter[0]), w0); v[0] = vmlal_n_s16(v[0], vget_low_s16(filter[1]), w2); v[1] = vmlal_n_s16(v[1], vget_high_s16(filter[1]), w2); - SelfGuidedFinal(src, v, dst); + return SelfGuidedFinal(src, v); } -inline void SelfGuidedSingleMultiplier(const uint8x8_t src, - const int16x8_t filter, const int w0, - uint8_t* const dst) { +inline uint8x8_t SelfGuidedSingleMultiplier(const uint8x8_t src, + const int16x8_t filter, + const int w0) { // weight: -96 to 96 (Sgrproj_Xqd_Min/Max) int32x4_t v[2]; v[0] = vmull_n_s16(vget_low_s16(filter), w0); v[1] = vmull_n_s16(vget_high_s16(filter), w0); - SelfGuidedFinal(src, v, dst); + return SelfGuidedFinal(src, v); } LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( @@ -1349,43 +1799,60 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( uint32_t* 
const square_sum5[5], const int width, const uint32_t scale, const int16_t w0, uint16_t* const ma565[2], uint32_t* const b565[2], uint8_t* const dst) { - uint8x8x2_t s[2], mas; - uint16x8x2_t sq[2], bs; - s[0].val[0] = vld1_u8(src0); - s[1].val[0] = vld1_u8(src1); - sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]); - sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]); - BoxFilterPreProcess5(src0, src1, 0, scale, sum5, square_sum5, s, sq, - &mas.val[0], &bs.val[0]); + uint8x16_t s[2][2], mas[2]; + uint16x8_t sq[2][4], bs[3]; + BoxFilterPreProcess5Lo(src0, src1, scale, s, sum5, square_sum5, sq, &mas[0], + &bs[0]); int x = 0; do { - s[0].val[0] = s[0].val[1]; - s[1].val[0] = s[1].val[1]; - sq[0].val[0] = sq[0].val[1]; - sq[1].val[0] = sq[1].val[1]; - BoxFilterPreProcess5(src0, src1, x + 8, scale, sum5, square_sum5, s, sq, - &mas.val[1], &bs.val[1]); uint16x8_t ma[2]; + uint8x16_t masx[3]; uint32x4x2_t b[2]; - ma[1] = Sum565(mas); + int16x8_t p0, p1; + BoxFilterPreProcess5(src0, src1, x + 8, scale, s, sum5, square_sum5, sq, + mas, bs + 1); + Prepare3_8<0>(mas, masx); + ma[1] = Sum565<0>(masx); b[1] = Sum565W(bs); vst1q_u16(ma565[1] + x, ma[1]); vst1q_u32(b565[1] + x + 0, b[1].val[0]); vst1q_u32(b565[1] + x + 4, b[1].val[1]); - const uint8x8_t sr0 = vld1_u8(src + x); - const uint8x8_t sr1 = vld1_u8(src + stride + x); - int16x8_t p0, p1; + const uint8x16_t sr0 = vld1q_u8(src + x); + const uint8x16_t sr1 = vld1q_u8(src + stride + x); + const uint8x8_t sr00 = vget_low_u8(sr0); + const uint8x8_t sr10 = vget_low_u8(sr1); ma[0] = vld1q_u16(ma565[0] + x); b[0].val[0] = vld1q_u32(b565[0] + x + 0); b[0].val[1] = vld1q_u32(b565[0] + x + 4); - p0 = CalculateFilteredOutputPass1(sr0, ma, b); - p1 = CalculateFilteredOutput<4>(sr1, ma[1], b[1]); - SelfGuidedSingleMultiplier(sr0, p0, w0, dst + x); - SelfGuidedSingleMultiplier(sr1, p1, w0, dst + stride + x); - mas.val[0] = mas.val[1]; - bs.val[0] = bs.val[1]; - x += 8; + p0 = CalculateFilteredOutputPass1(sr00, ma, b); + p1 = 
CalculateFilteredOutput<4>(sr10, ma[1], b[1]); + const uint8x8_t d00 = SelfGuidedSingleMultiplier(sr00, p0, w0); + const uint8x8_t d10 = SelfGuidedSingleMultiplier(sr10, p1, w0); + + ma[1] = Sum565<8>(masx); + b[1] = Sum565W(bs + 1); + vst1q_u16(ma565[1] + x + 8, ma[1]); + vst1q_u32(b565[1] + x + 8, b[1].val[0]); + vst1q_u32(b565[1] + x + 12, b[1].val[1]); + const uint8x8_t sr01 = vget_high_u8(sr0); + const uint8x8_t sr11 = vget_high_u8(sr1); + ma[0] = vld1q_u16(ma565[0] + x + 8); + b[0].val[0] = vld1q_u32(b565[0] + x + 8); + b[0].val[1] = vld1q_u32(b565[0] + x + 12); + p0 = CalculateFilteredOutputPass1(sr01, ma, b); + p1 = CalculateFilteredOutput<4>(sr11, ma[1], b[1]); + const uint8x8_t d01 = SelfGuidedSingleMultiplier(sr01, p0, w0); + const uint8x8_t d11 = SelfGuidedSingleMultiplier(sr11, p1, w0); + vst1q_u8(dst + x, vcombine_u8(d00, d01)); + vst1q_u8(dst + stride + x, vcombine_u8(d10, d11)); + s[0][0] = s[0][1]; + s[1][0] = s[1][1]; + sq[0][1] = sq[0][3]; + sq[1][1] = sq[1][3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + x += 16; } while (x < width); } @@ -1396,34 +1863,45 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src, uint32_t* const square_sum5[5], uint16_t* ma565, uint32_t* b565, uint8_t* const dst) { - uint8x8x2_t s, mas; - uint16x8x2_t sq, bs; - s.val[0] = vld1_u8(src0); - sq.val[0] = vmull_u8(s.val[0], s.val[0]); - BoxFilterPreProcess5LastRow(src0, 0, scale, sum5, square_sum5, &s, &sq, - &mas.val[0], &bs.val[0]); + uint8x16_t s[2], mas[2]; + uint16x8_t sq[4], bs[4]; + BoxFilterPreProcess5LastRowLo(src0, scale, s, sum5, square_sum5, sq, &mas[0], + &bs[0]); int x = 0; do { - s.val[0] = s.val[1]; - sq.val[0] = sq.val[1]; - BoxFilterPreProcess5LastRow(src0, x + 8, scale, sum5, square_sum5, &s, &sq, - &mas.val[1], &bs.val[1]); uint16x8_t ma[2]; + uint8x16_t masx[3]; uint32x4x2_t b[2]; - ma[1] = Sum565(mas); + BoxFilterPreProcess5LastRow(src0, x + 8, scale, s, sum5, square_sum5, + sq + 1, mas, bs + 1); + Prepare3_8<0>(mas, masx); + ma[1] = 
Sum565<0>(masx); b[1] = Sum565W(bs); - mas.val[0] = mas.val[1]; - bs.val[0] = bs.val[1]; ma[0] = vld1q_u16(ma565); b[0].val[0] = vld1q_u32(b565 + 0); b[0].val[1] = vld1q_u32(b565 + 4); - const uint8x8_t sr = vld1_u8(src + x); - const int16x8_t p = CalculateFilteredOutputPass1(sr, ma, b); - SelfGuidedSingleMultiplier(sr, p, w0, dst + x); - ma565 += 8; - b565 += 8; - x += 8; + const uint8x16_t sr = vld1q_u8(src + x); + const uint8x8_t sr0 = vget_low_u8(sr); + const int16x8_t p0 = CalculateFilteredOutputPass1(sr0, ma, b); + const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0); + + ma[1] = Sum565<8>(masx); + b[1] = Sum565W(bs + 1); + bs[0] = bs[2]; + const uint8x8_t sr1 = vget_high_u8(sr); + ma[0] = vld1q_u16(ma565 + 8); + b[0].val[0] = vld1q_u32(b565 + 8); + b[0].val[1] = vld1q_u32(b565 + 12); + const int16x8_t p1 = CalculateFilteredOutputPass1(sr1, ma, b); + const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0); + vst1q_u8(dst + x, vcombine_u8(d0, d1)); + s[0] = s[1]; + sq[1] = sq[3]; + mas[0] = mas[1]; + ma565 += 16; + b565 += 16; + x += 16; } while (x < width); } @@ -1433,35 +1911,49 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass2( uint32_t* const square_sum3[3], uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3], uint32_t* const b444[2], uint8_t* const dst) { - uint8x8x2_t s, mas; - uint16x8x2_t sq, bs; - s.val[0] = vld1_u8(src0); - sq.val[0] = vmull_u8(s.val[0], s.val[0]); - BoxFilterPreProcess3(src0, 0, scale, sum3, square_sum3, &s, &sq, &mas.val[0], - &bs.val[0]); + uint8x16_t s[2], mas[2]; + uint16x8_t sq[4], bs[3]; + BoxFilterPreProcess3Lo(src0, scale, &s[0], sum3, square_sum3, sq, &mas[0], + &bs[0]); int x = 0; do { - s.val[0] = s.val[1]; - sq.val[0] = sq.val[1]; - BoxFilterPreProcess3(src0, x + 8, scale, sum3, square_sum3, &s, &sq, - &mas.val[1], &bs.val[1]); uint16x8_t ma[3]; + uint8x16_t ma3x[3]; uint32x4x2_t b[3]; - Store343_444(mas, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2], - b444[1]); - const uint8x8_t sr = 
vld1_u8(src + x); + BoxFilterPreProcess3(src0, x + 8, scale, sum3, square_sum3, s, sq + 1, mas, + bs + 1); + Prepare3_8<0>(mas, ma3x); + Store343_444<0>(ma3x, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2], + b444[1]); + const uint8x16_t sr = vld1q_u8(src + x); + const uint8x8_t sr0 = vget_low_u8(sr); ma[0] = vld1q_u16(ma343[0] + x); ma[1] = vld1q_u16(ma444[0] + x); b[0].val[0] = vld1q_u32(b343[0] + x + 0); b[0].val[1] = vld1q_u32(b343[0] + x + 4); b[1].val[0] = vld1q_u32(b444[0] + x + 0); b[1].val[1] = vld1q_u32(b444[0] + x + 4); - const int16x8_t p = CalculateFilteredOutputPass2(sr, ma, b); - SelfGuidedSingleMultiplier(sr, p, w0, dst + x); - mas.val[0] = mas.val[1]; - bs.val[0] = bs.val[1]; - x += 8; + const int16x8_t p0 = CalculateFilteredOutputPass2(sr0, ma, b); + const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0); + + Store343_444<8>(ma3x, bs + 1, x + 8, &ma[2], &b[2], ma343[2], ma444[1], + b343[2], b444[1]); + const uint8x8_t sr1 = vget_high_u8(sr); + ma[0] = vld1q_u16(ma343[0] + x + 8); + ma[1] = vld1q_u16(ma444[0] + x + 8); + b[0].val[0] = vld1q_u32(b343[0] + x + 8); + b[0].val[1] = vld1q_u32(b343[0] + x + 12); + b[1].val[0] = vld1q_u32(b444[0] + x + 8); + b[1].val[1] = vld1q_u32(b444[0] + x + 12); + const int16x8_t p1 = CalculateFilteredOutputPass2(sr1, ma, b); + const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0); + vst1q_u8(dst + x, vcombine_u8(d0, d1)); + s[0] = s[1]; + sq[1] = sq[3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + x += 16; } while (x < width); } @@ -1474,64 +1966,96 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter( uint16_t* const ma343[4], uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) { - uint8x8x2_t s[2], ma3[2], ma5; - uint16x8x2_t sq[2], b3[2], b5; - s[0].val[0] = vld1_u8(src0); - s[1].val[0] = vld1_u8(src1); - sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]); - sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]); - 
BoxFilterPreProcess(src0, src1, 0, scales, sum3, sum5, square_sum3, - square_sum5, s, sq, &ma3[0].val[0], &ma3[1].val[0], - &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]); + uint8x16_t s[2][2], ma3[2][2], ma5[2]; + uint16x8_t sq[2][4], b3[2][3], b5[3]; + BoxFilterPreProcessLo(src0, src1, scales, s, sum3, sum5, square_sum3, + square_sum5, sq, ma3, b3, &ma5[0], &b5[0]); int x = 0; do { - s[0].val[0] = s[0].val[1]; - s[1].val[0] = s[1].val[1]; - sq[0].val[0] = sq[0].val[1]; - sq[1].val[0] = sq[1].val[1]; - BoxFilterPreProcess(src0, src1, x + 8, scales, sum3, sum5, square_sum3, - square_sum5, s, sq, &ma3[0].val[1], &ma3[1].val[1], - &b3[0].val[1], &b3[1].val[1], &ma5.val[1], &b5.val[1]); uint16x8_t ma[3][3]; + uint8x16_t ma3x[2][3], ma5x[3]; uint32x4x2_t b[3][3]; - Store343_444(ma3[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1], - ma343[2], ma444[1], b343[2], b444[1]); - Store343_444(ma3[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2], - b343[3], b444[2]); - ma[0][1] = Sum565(ma5); + int16x8_t p[2][2]; + BoxFilterPreProcess(src0, src1, x + 8, scales, s, sum3, sum5, square_sum3, + square_sum5, sq, ma3, b3, ma5, b5 + 1); + Prepare3_8<0>(ma3[0], ma3x[0]); + Prepare3_8<0>(ma3[1], ma3x[1]); + Store343_444<0>(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1], + ma343[2], ma444[1], b343[2], b444[1]); + Store343_444<0>(ma3x[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2], + b343[3], b444[2]); + Prepare3_8<0>(ma5, ma5x); + ma[0][1] = Sum565<0>(ma5x); b[0][1] = Sum565W(b5); vst1q_u16(ma565[1] + x, ma[0][1]); vst1q_u32(b565[1] + x, b[0][1].val[0]); vst1q_u32(b565[1] + x + 4, b[0][1].val[1]); - ma3[0].val[0] = ma3[0].val[1]; - ma3[1].val[0] = ma3[1].val[1]; - b3[0].val[0] = b3[0].val[1]; - b3[1].val[0] = b3[1].val[1]; - ma5.val[0] = ma5.val[1]; - b5.val[0] = b5.val[1]; - int16x8_t p[2][2]; - const uint8x8_t sr0 = vld1_u8(src + x); - const uint8x8_t sr1 = vld1_u8(src + stride + x); + const uint8x16_t sr0 = vld1q_u8(src + x); + const uint8x16_t 
sr1 = vld1q_u8(src + stride + x); + const uint8x8_t sr00 = vget_low_u8(sr0); + const uint8x8_t sr10 = vget_low_u8(sr1); ma[0][0] = vld1q_u16(ma565[0] + x); b[0][0].val[0] = vld1q_u32(b565[0] + x); b[0][0].val[1] = vld1q_u32(b565[0] + x + 4); - p[0][0] = CalculateFilteredOutputPass1(sr0, ma[0], b[0]); - p[1][0] = CalculateFilteredOutput<4>(sr1, ma[0][1], b[0][1]); + p[0][0] = CalculateFilteredOutputPass1(sr00, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr10, ma[0][1], b[0][1]); ma[1][0] = vld1q_u16(ma343[0] + x); ma[1][1] = vld1q_u16(ma444[0] + x); b[1][0].val[0] = vld1q_u32(b343[0] + x); b[1][0].val[1] = vld1q_u32(b343[0] + x + 4); b[1][1].val[0] = vld1q_u32(b444[0] + x); b[1][1].val[1] = vld1q_u32(b444[0] + x + 4); - p[0][1] = CalculateFilteredOutputPass2(sr0, ma[1], b[1]); + p[0][1] = CalculateFilteredOutputPass2(sr00, ma[1], b[1]); ma[2][0] = vld1q_u16(ma343[1] + x); b[2][0].val[0] = vld1q_u32(b343[1] + x); b[2][0].val[1] = vld1q_u32(b343[1] + x + 4); - p[1][1] = CalculateFilteredOutputPass2(sr1, ma[2], b[2]); - SelfGuidedDoubleMultiplier(sr0, p[0], w0, w2, dst + x); - SelfGuidedDoubleMultiplier(sr1, p[1], w0, w2, dst + stride + x); - x += 8; + p[1][1] = CalculateFilteredOutputPass2(sr10, ma[2], b[2]); + const uint8x8_t d00 = SelfGuidedDoubleMultiplier(sr00, p[0], w0, w2); + const uint8x8_t d10 = SelfGuidedDoubleMultiplier(sr10, p[1], w0, w2); + + Store343_444<8>(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], &b[1][2], + &b[2][1], ma343[2], ma444[1], b343[2], b444[1]); + Store343_444<8>(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], &b[2][2], ma343[3], + ma444[2], b343[3], b444[2]); + ma[0][1] = Sum565<8>(ma5x); + b[0][1] = Sum565W(b5 + 1); + vst1q_u16(ma565[1] + x + 8, ma[0][1]); + vst1q_u32(b565[1] + x + 8, b[0][1].val[0]); + vst1q_u32(b565[1] + x + 12, b[0][1].val[1]); + b3[0][0] = b3[0][2]; + b3[1][0] = b3[1][2]; + b5[0] = b5[2]; + const uint8x8_t sr01 = vget_high_u8(sr0); + const uint8x8_t sr11 = vget_high_u8(sr1); + ma[0][0] = vld1q_u16(ma565[0] + x + 
8); + b[0][0].val[0] = vld1q_u32(b565[0] + x + 8); + b[0][0].val[1] = vld1q_u32(b565[0] + x + 12); + p[0][0] = CalculateFilteredOutputPass1(sr01, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr11, ma[0][1], b[0][1]); + ma[1][0] = vld1q_u16(ma343[0] + x + 8); + ma[1][1] = vld1q_u16(ma444[0] + x + 8); + b[1][0].val[0] = vld1q_u32(b343[0] + x + 8); + b[1][0].val[1] = vld1q_u32(b343[0] + x + 12); + b[1][1].val[0] = vld1q_u32(b444[0] + x + 8); + b[1][1].val[1] = vld1q_u32(b444[0] + x + 12); + p[0][1] = CalculateFilteredOutputPass2(sr01, ma[1], b[1]); + ma[2][0] = vld1q_u16(ma343[1] + x + 8); + b[2][0].val[0] = vld1q_u32(b343[1] + x + 8); + b[2][0].val[1] = vld1q_u32(b343[1] + x + 12); + p[1][1] = CalculateFilteredOutputPass2(sr11, ma[2], b[2]); + const uint8x8_t d01 = SelfGuidedDoubleMultiplier(sr01, p[0], w0, w2); + const uint8x8_t d11 = SelfGuidedDoubleMultiplier(sr11, p[1], w0, w2); + vst1q_u8(dst + x, vcombine_u8(d00, d01)); + vst1q_u8(dst + stride + x, vcombine_u8(d10, d11)); + s[0][0] = s[0][1]; + s[1][0] = s[1][1]; + sq[0][1] = sq[0][3]; + sq[1][1] = sq[1][3]; + ma3[0][0] = ma3[0][1]; + ma3[1][0] = ma3[1][1]; + ma5[0] = ma5[1]; + x += 16; } while (x < width); } @@ -1540,58 +2064,79 @@ inline void BoxFilterLastRow( const uint16_t scales[2], const int16_t w0, const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5], uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - uint16_t* const ma343[4], uint16_t* const ma444[3], - uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3], - uint32_t* const b565[2], uint8_t* const dst) { - uint8x8x2_t s, ma3, ma5; - uint16x8x2_t sq, b3, b5; - uint16x8_t ma[3]; + uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565, + uint32_t* const b343, uint32_t* const b444, uint32_t* const b565, + uint8_t* const dst) { + uint8x16_t s[2], ma3[2], ma5[2]; + uint16x8_t sq[4], ma[3], b3[3], b5[3]; uint32x4x2_t b[3]; - s.val[0] = vld1_u8(src0); - sq.val[0] = vmull_u8(s.val[0], 
s.val[0]); - BoxFilterPreProcessLastRow(src0, 0, scales, sum3, sum5, square_sum3, - square_sum5, &s, &sq, &ma3.val[0], &ma5.val[0], - &b3.val[0], &b5.val[0]); + BoxFilterPreProcessLastRowLo(src0, scales, sum3, sum5, square_sum3, + square_sum5, &s[0], sq, &ma3[0], &ma5[0], &b3[0], + &b5[0]); int x = 0; do { - s.val[0] = s.val[1]; - sq.val[0] = sq.val[1]; + uint8x16_t ma3x[3], ma5x[3]; + int16x8_t p[2]; BoxFilterPreProcessLastRow(src0, x + 8, scales, sum3, sum5, square_sum3, - square_sum5, &s, &sq, &ma3.val[1], &ma5.val[1], - &b3.val[1], &b5.val[1]); - ma[1] = Sum565(ma5); + square_sum5, s, sq + 1, ma3, ma5, &b3[1], + &b5[1]); + Prepare3_8<0>(ma5, ma5x); + ma[1] = Sum565<0>(ma5x); b[1] = Sum565W(b5); - ma5.val[0] = ma5.val[1]; - b5.val[0] = b5.val[1]; - ma[2] = Sum343(ma3); + Prepare3_8<0>(ma3, ma3x); + ma[2] = Sum343<0>(ma3x); b[2] = Sum343W(b3); - ma3.val[0] = ma3.val[1]; - b3.val[0] = b3.val[1]; - const uint8x8_t sr = vld1_u8(src + x); - int16x8_t p[2]; - ma[0] = vld1q_u16(ma565[0] + x); - b[0].val[0] = vld1q_u32(b565[0] + x + 0); - b[0].val[1] = vld1q_u32(b565[0] + x + 4); - p[0] = CalculateFilteredOutputPass1(sr, ma, b); - ma[0] = vld1q_u16(ma343[0] + x); - ma[1] = vld1q_u16(ma444[0] + x); - b[0].val[0] = vld1q_u32(b343[0] + x + 0); - b[0].val[1] = vld1q_u32(b343[0] + x + 4); - b[1].val[0] = vld1q_u32(b444[0] + x + 0); - b[1].val[1] = vld1q_u32(b444[0] + x + 4); - p[1] = CalculateFilteredOutputPass2(sr, ma, b); - SelfGuidedDoubleMultiplier(sr, p, w0, w2, dst + x); - x += 8; + const uint8x16_t sr = vld1q_u8(src + x); + const uint8x8_t sr0 = vget_low_u8(sr); + ma[0] = vld1q_u16(ma565 + x); + b[0].val[0] = vld1q_u32(b565 + x + 0); + b[0].val[1] = vld1q_u32(b565 + x + 4); + p[0] = CalculateFilteredOutputPass1(sr0, ma, b); + ma[0] = vld1q_u16(ma343 + x); + ma[1] = vld1q_u16(ma444 + x); + b[0].val[0] = vld1q_u32(b343 + x + 0); + b[0].val[1] = vld1q_u32(b343 + x + 4); + b[1].val[0] = vld1q_u32(b444 + x + 0); + b[1].val[1] = vld1q_u32(b444 + x + 4); + p[1] = 
CalculateFilteredOutputPass2(sr0, ma, b); + const uint8x8_t d0 = SelfGuidedDoubleMultiplier(sr0, p, w0, w2); + + ma[1] = Sum565<8>(ma5x); + b[1] = Sum565W(b5 + 1); + b5[0] = b5[2]; + ma[2] = Sum343<8>(ma3x); + b[2] = Sum343W(b3 + 1); + b3[0] = b3[2]; + const uint8x8_t sr1 = vget_high_u8(sr); + ma[0] = vld1q_u16(ma565 + x + 8); + b[0].val[0] = vld1q_u32(b565 + x + 8); + b[0].val[1] = vld1q_u32(b565 + x + 12); + p[0] = CalculateFilteredOutputPass1(sr1, ma, b); + ma[0] = vld1q_u16(ma343 + x + 8); + ma[1] = vld1q_u16(ma444 + x + 8); + b[0].val[0] = vld1q_u32(b343 + x + 8); + b[0].val[1] = vld1q_u32(b343 + x + 12); + b[1].val[0] = vld1q_u32(b444 + x + 8); + b[1].val[1] = vld1q_u32(b444 + x + 12); + p[1] = CalculateFilteredOutputPass2(sr1, ma, b); + const uint8x8_t d1 = SelfGuidedDoubleMultiplier(sr1, p, w0, w2); + vst1q_u8(dst + x, vcombine_u8(d0, d1)); + s[0] = s[1]; + sq[1] = sq[3]; + ma3[0] = ma3[1]; + ma5[0] = ma5[1]; + x += 16; } while (x < width); } LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( const RestorationUnitInfo& restoration_info, const uint8_t* src, - const uint8_t* const top_border, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, const int height, + const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, SgrBuffer* const sgr_buffer, uint8_t* dst) { - const auto temp_stride = Align<ptrdiff_t>(width, 8); + const auto temp_stride = Align<ptrdiff_t>(width, 16); const ptrdiff_t sum_stride = temp_stride + 8; const int sgr_proj_index = restoration_info.sgr_proj_info.index; const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12. 
@@ -1628,13 +2173,13 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( b565[1] = b565[0] + temp_stride; assert(scales[0] != 0); assert(scales[1] != 0); - BoxSum(top_border, stride, 2, sum_stride, sum3[0], sum5[1], square_sum3[0], - square_sum5[1]); + BoxSum(top_border, top_border_stride, sum_stride, sum3[0], sum5[1], + square_sum3[0], square_sum5[1]); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; const uint8_t* const s = (height > 1) ? src + stride : bottom_border; BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3, - square_sum5, ma343, ma444, ma565[0], b343, b444, + square_sum5, ma343, ma444[0], ma565[0], b343, b444[0], b565[0]); sum5[0] = sgr_buffer->sum5; square_sum5[0] = sgr_buffer->square_sum5; @@ -1665,7 +2210,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( const uint8_t* sr[2]; if ((height & 1) == 0) { sr[0] = bottom_border; - sr[1] = bottom_border + stride; + sr[1] = bottom_border + bottom_border_stride; } else { sr[0] = src + 2 * stride; sr[1] = bottom_border; @@ -1689,20 +2234,22 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( std::swap(ma565[0], ma565[1]); std::swap(b565[0], b565[1]); } - BoxFilterLastRow(src + 3, bottom_border + stride, width, scales, w0, w2, - sum3, sum5, square_sum3, square_sum5, ma343, ma444, ma565, - b343, b444, b565, dst); + BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width, + scales, w0, w2, sum3, sum5, square_sum3, square_sum5, + ma343[0], ma444[0], ma565[0], b343[0], b444[0], b565[0], + dst); } } inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, - const uint8_t* src, + const uint8_t* src, const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - uint8_t* dst) { - const auto temp_stride = Align<ptrdiff_t>(width, 8); + const ptrdiff_t bottom_border_stride, + const int width, const int height, + 
SgrBuffer* const sgr_buffer, uint8_t* dst) { + const auto temp_stride = Align<ptrdiff_t>(width, 16); const ptrdiff_t sum_stride = temp_stride + 8; const int sgr_proj_index = restoration_info.sgr_proj_info.index; const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12. @@ -1720,7 +2267,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, b565[0] = sgr_buffer->b565; b565[1] = b565[0] + temp_stride; assert(scale != 0); - BoxSum<5>(top_border, stride, 2, sum_stride, sum5[1], square_sum5[1]); + BoxSum<5>(top_border, top_border_stride, sum_stride, sum5[1], square_sum5[1]); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; const uint8_t* const s = (height > 1) ? src + stride : bottom_border; @@ -1746,7 +2293,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, const uint8_t* sr[2]; if ((height & 1) == 0) { sr[0] = bottom_border; - sr[1] = bottom_border + stride; + sr[1] = bottom_border + bottom_border_stride; } else { sr[0] = src + 2 * stride; sr[1] = bottom_border; @@ -1763,20 +2310,21 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, Circulate5PointersBy2<uint16_t>(sum5); Circulate5PointersBy2<uint32_t>(square_sum5); } - BoxFilterPass1LastRow(src + 3, bottom_border + stride, width, scale, w0, - sum5, square_sum5, ma565[0], b565[0], dst); + BoxFilterPass1LastRow(src + 3, bottom_border + bottom_border_stride, width, + scale, w0, sum5, square_sum5, ma565[0], b565[0], dst); } } inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, - const uint8_t* src, + const uint8_t* src, const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - uint8_t* dst) { + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { 
assert(restoration_info.sgr_proj_info.multiplier[0] == 0); - const auto temp_stride = Align<ptrdiff_t>(width, 8); + const auto temp_stride = Align<ptrdiff_t>(width, 16); const ptrdiff_t sum_stride = temp_stride + 8; const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; @@ -1799,7 +2347,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, b444[0] = sgr_buffer->b444; b444[1] = b444[0] + temp_stride; assert(scale != 0); - BoxSum<3>(top_border, stride, 2, sum_stride, sum3[0], square_sum3[0]); + BoxSum<3>(top_border, top_border_stride, sum_stride, sum3[0], square_sum3[0]); BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, ma343[0], nullptr, b343[0], nullptr); Circulate3PointersBy1<uint16_t>(sum3); @@ -1809,7 +2357,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, s = src + stride; } else { s = bottom_border; - bottom_border += stride; + bottom_border += bottom_border_stride; } BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, ma343[1], ma444[0], b343[1], b444[0]); @@ -1836,7 +2384,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, ma343, ma444, b343, b444, dst); src += stride; dst += stride; - bottom_border += stride; + bottom_border += bottom_border_stride; Circulate3PointersBy1<uint16_t>(ma343); Circulate3PointersBy1<uint32_t>(b343); std::swap(ma444[0], ma444[1]); @@ -1849,8 +2397,9 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, // part of the visible frame. 
void SelfGuidedFilter_NEON( const RestorationUnitInfo& restoration_info, const void* const source, - const void* const top_border, const void* const bottom_border, - const ptrdiff_t stride, const int width, const int height, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, RestorationBuffer* const restoration_buffer, void* const dest) { const int index = restoration_info.sgr_proj_info.index; const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 @@ -1864,14 +2413,17 @@ void SelfGuidedFilter_NEON( // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the // following assertion. assert(radius_pass_0 != 0); - BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3, - stride, width, height, sgr_buffer, dst); + BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, + width, height, sgr_buffer, dst); } else if (radius_pass_0 == 0) { - BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2, - stride, width, height, sgr_buffer, dst); + BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2, + top_border_stride, bottom - 2, bottom_border_stride, + width, height, sgr_buffer, dst); } else { - BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride, - width, height, sgr_buffer, dst); + BoxFilterProcess(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, width, + height, sgr_buffer, dst); } } @@ -1890,7 +2442,7 @@ void LoopRestorationInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/mask_blend_neon.cc b/src/dsp/arm/mask_blend_neon.cc index 084f42f..ee50923 100644 --- 
a/src/dsp/arm/mask_blend_neon.cc +++ b/src/dsp/arm/mask_blend_neon.cc @@ -432,7 +432,7 @@ void MaskBlendInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/motion_field_projection_neon.cc b/src/dsp/arm/motion_field_projection_neon.cc index 8caba7d..3e731b2 100644 --- a/src/dsp/arm/motion_field_projection_neon.cc +++ b/src/dsp/arm/motion_field_projection_neon.cc @@ -382,7 +382,7 @@ void MotionFieldProjectionInit_NEON() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/motion_vector_search_neon.cc b/src/dsp/arm/motion_vector_search_neon.cc index 8a403a6..da3ba17 100644 --- a/src/dsp/arm/motion_vector_search_neon.cc +++ b/src/dsp/arm/motion_vector_search_neon.cc @@ -256,7 +256,7 @@ void MotionVectorSearchInit_NEON() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/obmc_neon.cc b/src/dsp/arm/obmc_neon.cc index 66ad663..1111a90 100644 --- a/src/dsp/arm/obmc_neon.cc +++ b/src/dsp/arm/obmc_neon.cc @@ -380,7 +380,7 @@ void ObmcInit_NEON() { Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/super_res_neon.cc b/src/dsp/arm/super_res_neon.cc index 1680450..91537c4 100644 --- a/src/dsp/arm/super_res_neon.cc +++ b/src/dsp/arm/super_res_neon.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "src/dsp/arm/common_neon.h" #include "src/dsp/super_res.h" #include "src/utils/cpu.h" @@ -20,6 +19,7 @@ #include <arm_neon.h> +#include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/utils/common.h" @@ -82,10 +82,10 @@ inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps], } void SuperRes_NEON(const void* const coefficients, void* const source, - const ptrdiff_t stride, const int height, + const ptrdiff_t source_stride, const int height, const int downscaled_width, const int upscaled_width, const int initial_subpixel_x, const int step, - void* const dest) { + void* const dest, const ptrdiff_t dest_stride) { auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps); auto* dst = static_cast<uint8_t*>(dest); int y = height; @@ -100,7 +100,7 @@ void SuperRes_NEON(const void* const coefficients, void* const source, int x = RightShiftWithCeiling(upscaled_width, 4); // The below code calculates up to 15 extra upscaled // pixels which will over-read up to 15 downscaled pixels in the end of each - // row. kSuperResHorizontalBorder accounts for this. + // row. kSuperResHorizontalPadding accounts for this. 
do { for (int i = 0; i < 8; ++i, subpixel_x += step) { sr[i] = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]); @@ -135,8 +135,8 @@ void SuperRes_NEON(const void* const coefficients, void* const source, vst1q_u8(dst_ptr, vcombine_u8(d0, d1)); dst_ptr += 16; } while (--x != 0); - src += stride; - dst += stride; + src += source_stride; + dst += dest_stride; } while (--y != 0); } @@ -149,12 +149,147 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void SuperResInit_NEON() { low_bitdepth::Init8bpp(); } +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +void SuperResCoefficients_NEON(const int upscaled_width, + const int initial_subpixel_x, const int step, + void* const coefficients) { + auto* dst = static_cast<uint16_t*>(coefficients); + int subpixel_x = initial_subpixel_x; + int x = RightShiftWithCeiling(upscaled_width, 3); + do { + uint16x8_t filter[8]; + for (int i = 0; i < 8; ++i, subpixel_x += step) { + const uint8x8_t filter_8 = + vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >> + kSuperResExtraBits]); + // uint8_t -> uint16_t + filter[i] = vmovl_u8(filter_8); + } + + Transpose8x8(filter); + + vst1q_u16(dst, filter[0]); + dst += 8; + vst1q_u16(dst, filter[1]); + dst += 8; + vst1q_u16(dst, filter[2]); + dst += 8; + vst1q_u16(dst, filter[3]); + dst += 8; + vst1q_u16(dst, filter[4]); + dst += 8; + vst1q_u16(dst, filter[5]); + dst += 8; + vst1q_u16(dst, filter[6]); + dst += 8; + vst1q_u16(dst, filter[7]); + dst += 8; + } while (--x != 0); +} + +// The sum is clipped to [0, ((1 << bitdepth) -1)]. Adding all positive and then +// subtracting all negative with saturation will clip to zero. 
+// 0 1 2 3 4 5 6 7 +// tap sign: - + - + + - + - +inline uint16x8_t SuperRes(const uint16x8_t src[kSuperResFilterTaps], + const uint16_t** coefficients, int bitdepth) { + uint16x8_t f[kSuperResFilterTaps]; + for (int i = 0; i < kSuperResFilterTaps; ++i, *coefficients += 8) { + f[i] = vld1q_u16(*coefficients); + } + + uint32x4_t res_lo = vmull_u16(vget_low_u16(src[1]), vget_low_u16(f[1])); + res_lo = vmlal_u16(res_lo, vget_low_u16(src[3]), vget_low_u16(f[3])); + res_lo = vmlal_u16(res_lo, vget_low_u16(src[4]), vget_low_u16(f[4])); + res_lo = vmlal_u16(res_lo, vget_low_u16(src[6]), vget_low_u16(f[6])); + + uint32x4_t temp_lo = vmull_u16(vget_low_u16(src[0]), vget_low_u16(f[0])); + temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[2]), vget_low_u16(f[2])); + temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[5]), vget_low_u16(f[5])); + temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[7]), vget_low_u16(f[7])); + + res_lo = vqsubq_u32(res_lo, temp_lo); + + uint32x4_t res_hi = vmull_u16(vget_high_u16(src[1]), vget_high_u16(f[1])); + res_hi = vmlal_u16(res_hi, vget_high_u16(src[3]), vget_high_u16(f[3])); + res_hi = vmlal_u16(res_hi, vget_high_u16(src[4]), vget_high_u16(f[4])); + res_hi = vmlal_u16(res_hi, vget_high_u16(src[6]), vget_high_u16(f[6])); + uint32x4_t temp_hi = vmull_u16(vget_high_u16(src[0]), vget_high_u16(f[0])); + temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[2]), vget_high_u16(f[2])); + temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[5]), vget_high_u16(f[5])); + temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[7]), vget_high_u16(f[7])); + + res_hi = vqsubq_u32(res_hi, temp_hi); + + const uint16x8_t res = vcombine_u16(vqrshrn_n_u32(res_lo, kFilterBits), + vqrshrn_n_u32(res_hi, kFilterBits)); + + // Clip the result at (1 << bd) - 1. 
+ return vminq_u16(res, vdupq_n_u16((1 << bitdepth) - 1)); +} + +template <int bitdepth> +void SuperRes_NEON(const void* const coefficients, void* const source, + const ptrdiff_t source_stride, const int height, + const int downscaled_width, const int upscaled_width, + const int initial_subpixel_x, const int step, + void* const dest, const ptrdiff_t dest_stride) { + auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps); + auto* dst = static_cast<uint16_t*>(dest); + int y = height; + do { + const auto* filter = static_cast<const uint16_t*>(coefficients); + uint16_t* dst_ptr = dst; + ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width, + kSuperResHorizontalBorder, kSuperResHorizontalBorder); + int subpixel_x = initial_subpixel_x; + uint16x8_t sr[8]; + int x = RightShiftWithCeiling(upscaled_width, 3); + // The below code calculates up to 7 extra upscaled + // pixels which will over-read up to 7 downscaled pixels in the end of each + // row. kSuperResHorizontalBorder accounts for this. 
+ do { + for (int i = 0; i < 8; ++i, subpixel_x += step) { + sr[i] = vld1q_u16(&src[subpixel_x >> kSuperResScaleBits]); + } + + Transpose8x8(sr); + + const uint16x8_t d0 = SuperRes(sr, &filter, bitdepth); + vst1q_u16(dst_ptr, d0); + dst_ptr += 8; + } while (--x != 0); + src += source_stride; + dst += dest_stride; + } while (--y != 0); +} + +void Init10bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->super_res_coefficients = SuperResCoefficients_NEON; + dsp->super_res = SuperRes_NEON<10>; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void SuperResInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/super_res_neon.h b/src/dsp/arm/super_res_neon.h index f51785d..65e48c5 100644 --- a/src/dsp/arm/super_res_neon.h +++ b/src/dsp/arm/super_res_neon.h @@ -31,7 +31,10 @@ void SuperResInit_NEON(); #if LIBGAV1_ENABLE_NEON #define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_SuperResClip LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_NEON #endif // LIBGAV1_ENABLE_NEON #endif // LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_ diff --git a/src/dsp/arm/warp_neon.cc b/src/dsp/arm/warp_neon.cc index 7a41998..c7fb739 100644 --- a/src/dsp/arm/warp_neon.cc +++ b/src/dsp/arm/warp_neon.cc @@ -289,7 +289,7 @@ void Warp_NEON(const void* const source, const ptrdiff_t source_stride, const int16x8_t sum = vld1q_s16(tmp); vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum)); } -#else // !defined(__aarch64__) +#else // !defined(__aarch64__) int16x8_t filter[8]; for (int x = 0; x < 8; ++x) { const int offset = 
@@ -442,7 +442,7 @@ void WarpInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/weight_mask_neon.cc b/src/dsp/arm/weight_mask_neon.cc index 49d3be0..7e5bff0 100644 --- a/src/dsp/arm/weight_mask_neon.cc +++ b/src/dsp/arm/weight_mask_neon.cc @@ -451,7 +451,7 @@ void WeightMaskInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/average_blend.cc b/src/dsp/average_blend.cc index a59abb0..d3ec21f 100644 --- a/src/dsp/average_blend.cc +++ b/src/dsp/average_blend.cc @@ -76,9 +76,7 @@ void Init10bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(10); assert(dsp != nullptr); #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS -#ifndef LIBGAV1_Dsp10bpp_AverageBlend dsp->average_blend = AverageBlend_C<10, uint16_t>; -#endif #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS static_cast<void>(dsp); #ifndef LIBGAV1_Dsp10bpp_AverageBlend diff --git a/src/dsp/average_blend_test.cc b/src/dsp/average_blend_test.cc new file mode 100644 index 0000000..fe8a9d6 --- /dev/null +++ b/src/dsp/average_blend_test.cc @@ -0,0 +1,322 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/average_blend.h" + +#include <cstdint> +#include <ostream> +#include <string> +#include <type_traits> + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/distance_weighted_blend.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kNumSpeedTests = 5e8; +constexpr char kAverageBlend[] = "AverageBlend"; +// average_blend is applied to compound prediction values. This implies a range +// far exceeding that of pixel values. +// The ranges include kCompoundOffset in 10bpp and 12bpp. +// see: src/dsp/convolve.cc & src/dsp/warp.cc. 
+constexpr int kCompoundPredictionRange[3][2] = { + // 8bpp + {-5132, 9212}, + // 10bpp + {3988, 61532}, + // 12bpp + {3974, 61559}, +}; + +struct TestParam { + TestParam(int width, int height) : width(width), height(height) {} + int width; + int height; +}; + +std::ostream& operator<<(std::ostream& os, const TestParam& param) { + return os << "BlockSize" << param.width << "x" << param.height; +} + +template <int bitdepth, typename Pixel> +class AverageBlendTest : public testing::TestWithParam<TestParam>, + public test_utils::MaxAlignedAllocable { + public: + AverageBlendTest() = default; + ~AverageBlendTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + AverageBlendInit_C(); + DistanceWeightedBlendInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_func_ = dsp->average_blend; + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + base_func_ = nullptr; + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + AverageBlendInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + AverageBlendInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + func_ = dsp->average_blend; + dist_blend_func_ = dsp->distance_weighted_blend; + } + + protected: + void Test(const char* digest, int num_tests, bool debug); + + private: + using PredType = + typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type; + static constexpr int kDestStride = kMaxSuperBlockSizeInPixels; + const int width_ = GetParam().width; + const int height_ = GetParam().height; + alignas(kMaxAlignment) PredType + source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels]; + alignas(kMaxAlignment) PredType + 
source2_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels]; + Pixel dest_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {}; + Pixel reference_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = + {}; + dsp::AverageBlendFunc base_func_; + dsp::AverageBlendFunc func_; + dsp::DistanceWeightedBlendFunc dist_blend_func_; +}; + +template <int bitdepth, typename Pixel> +void AverageBlendTest<bitdepth, Pixel>::Test(const char* digest, int num_tests, + bool debug) { + if (func_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + PredType* src_1 = source1_; + PredType* src_2 = source2_; + for (int y = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x) { + constexpr int bitdepth_index = (bitdepth - 8) >> 1; + const int min_val = kCompoundPredictionRange[bitdepth_index][0]; + const int max_val = kCompoundPredictionRange[bitdepth_index][1]; + src_1[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val); + src_2[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val); + } + src_1 += width_; + src_2 += width_; + } + absl::Duration elapsed_time; + for (int i = 0; i < num_tests; ++i) { + const absl::Time start = absl::Now(); + func_(source1_, source2_, width_, height_, dest_, + sizeof(dest_[0]) * kDestStride); + elapsed_time += absl::Now() - start; + } + if (debug) { + if (base_func_ != nullptr) { + base_func_(source1_, source2_, width_, height_, reference_, + sizeof(reference_[0]) * kDestStride); + } else { + // Use dist_blend_func_ as the base for C tests. 
+ const int8_t weight = 8; + dist_blend_func_(source1_, source2_, weight, weight, width_, height_, + reference_, sizeof(reference_[0]) * kDestStride); + } + EXPECT_TRUE(test_utils::CompareBlocks(dest_, reference_, width_, height_, + kDestStride, kDestStride, false)); + } + + test_utils::CheckMd5Digest( + kAverageBlend, absl::StrFormat("%dx%d", width_, height_).c_str(), digest, + dest_, sizeof(dest_[0]) * kDestStride * height_, elapsed_time); +} + +const TestParam kTestParam[] = { + TestParam(4, 4), TestParam(4, 8), TestParam(8, 8), + TestParam(8, 16), TestParam(16, 8), TestParam(16, 16), + TestParam(16, 32), TestParam(32, 16), TestParam(32, 32), + TestParam(32, 64), TestParam(64, 32), TestParam(64, 64), + TestParam(64, 128), TestParam(128, 64), TestParam(128, 128), +}; + +using AverageBlendTest8bpp = AverageBlendTest<8, uint8_t>; + +const char* GetAverageBlendDigest8bpp(const TestParam block_size) { + static const char* const kDigestsWidth4[] = { + "152bcc35946900b1ed16369b3e7a81b7", + "c23e9b5698f7384eaae30a3908118b77", + }; + static const char* const kDigestsWidth8[] = { + "d90d3abd368e58c513070a88b34649ba", + "77f7d53d0edeffb3537afffd9ff33a4a", + }; + static const char* const kDigestsWidth16[] = { + "a50e268e93b48ae39cc2a47d377410e2", + "65c8502ff6d78065d466f9911ed6bb3e", + "bc2c873b9f5d74b396e1df705e87f699", + }; + static const char* const kDigestsWidth32[] = { + "ca40d46d89773e7f858b15fcecd43cc0", + "bfdc894707323f4dc43d1326309f8368", + "f4733417621719b7feba3166ec0da5b9", + }; + static const char* const kDigestsWidth64[] = { + "db38fe2e082bd4a09acb3bb1d52ee11e", + "3ad44401cc731215c46c9b7d96f7e4ae", + "6c43267be5ed03d204a05fe36090f870", + }; + static const char* const kDigestsWidth128[] = { + "c8cfe46ebf166c1cbf08e8804206aadb", + "b0557b5156d2334c8ce4a7ee12f9d6b4", + }; + // height < width implies 0. + // height == width implies 1. + // height > width implies 2. 
+ const int height_index = block_size.height / block_size.width; + switch (block_size.width) { + case 4: + return kDigestsWidth4[height_index - 1]; + case 8: + return kDigestsWidth8[height_index - 1]; + case 16: + return kDigestsWidth16[height_index]; + case 32: + return kDigestsWidth32[height_index]; + case 64: + return kDigestsWidth64[height_index]; + default: + EXPECT_EQ(block_size.width, 128) + << "Unknown width parameter: " << block_size.width; + return kDigestsWidth128[height_index]; + } +} + +TEST_P(AverageBlendTest8bpp, Blending) { + Test(GetAverageBlendDigest8bpp(GetParam()), 1, false); +} + +TEST_P(AverageBlendTest8bpp, DISABLED_Speed) { + Test(GetAverageBlendDigest8bpp(GetParam()), + kNumSpeedTests / (GetParam().height * GetParam().width), false); +} + +INSTANTIATE_TEST_SUITE_P(C, AverageBlendTest8bpp, + testing::ValuesIn(kTestParam)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, AverageBlendTest8bpp, + testing::ValuesIn(kTestParam)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, AverageBlendTest8bpp, + testing::ValuesIn(kTestParam)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using AverageBlendTest10bpp = AverageBlendTest<10, uint16_t>; + +const char* GetAverageBlendDigest10bpp(const TestParam block_size) { + static const char* const kDigestsWidth4[] = { + "98c0671c092b4288adcaaa17362cc4a3", + "7083f3def8bfb63ab3a985ef5616a923", + }; + static const char* const kDigestsWidth8[] = { + "3bee144b9ea6f4288b860c24f88a22f3", + "27113bd17bf95034f100e9046c7b59d2", + }; + static const char* const kDigestsWidth16[] = { + "24c9e079b9a8647a6ee03f5441f2cdd9", + "dd05777751ccdb4356856c90e1176e53", + "27b1d69d035b1525c013b7373cfe3875", + }; + static const char* const kDigestsWidth32[] = { + "efd24dd7b555786bff1a482e51170ea3", + "3b37ddac87de443cd18784f02c2d1dd5", + "80d8070939a743a20689a65bf5dc0a68", + }; + static const char* const kDigestsWidth64[] = { + "af1fe8c52487c9f2951c3ea516828abb", + "ea6f18ff56b053748c18032b7e048e83", + 
"af0cb87fe27d24c2e0afd2c90a8533a6", + }; + static const char* const kDigestsWidth128[] = { + "16a83b19911d6dc7278a694b8baa9901", + "bd22e77ce6fa727267ff63eeb4dcb19c", + }; + // (height < width) -> 0 + // (height == width) -> 1 + // (height > width) -> 2 + const int height_index = block_size.height / block_size.width; + switch (block_size.width) { + case 4: + return kDigestsWidth4[height_index - 1]; + case 8: + return kDigestsWidth8[height_index - 1]; + case 16: + return kDigestsWidth16[height_index]; + case 32: + return kDigestsWidth32[height_index]; + case 64: + return kDigestsWidth64[height_index]; + default: + EXPECT_EQ(block_size.width, 128) + << "Unknown width parameter: " << block_size.width; + return kDigestsWidth128[height_index]; + } +} + +TEST_P(AverageBlendTest10bpp, Blending) { + Test(GetAverageBlendDigest10bpp(GetParam()), 1, false); +} + +TEST_P(AverageBlendTest10bpp, DISABLED_Speed) { + Test(GetAverageBlendDigest10bpp(GetParam()), + kNumSpeedTests / (GetParam().height * GetParam().width) / 2, false); +} + +INSTANTIATE_TEST_SUITE_P(C, AverageBlendTest10bpp, + testing::ValuesIn(kTestParam)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, AverageBlendTest10bpp, + testing::ValuesIn(kTestParam)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, AverageBlendTest10bpp, + testing::ValuesIn(kTestParam)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/cdef.h b/src/dsp/cdef.h index 2d70d2c..b820b77 100644 --- a/src/dsp/cdef.h +++ b/src/dsp/cdef.h @@ -30,6 +30,7 @@ // The order of includes is important as each tests for a superior version // before setting the base. 
// clang-format off +#include "src/dsp/x86/cdef_avx2.h" #include "src/dsp/x86/cdef_sse4.h" // clang-format on // IWYU pragma: end_exports diff --git a/src/dsp/cdef_test.cc b/src/dsp/cdef_test.cc new file mode 100644 index 0000000..fd64593 --- /dev/null +++ b/src/dsp/cdef_test.cc @@ -0,0 +1,409 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/cdef.h" + +#include <cstdint> +#include <cstring> +#include <ostream> + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/third_party/libvpx/md5_helper.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr char kCdef[] = "Cdef"; +constexpr char kCdefDirectionName[] = "Cdef Direction"; +constexpr char kCdefFilterName[] = "Cdef Filtering"; +constexpr int kTestBufferStride = 8; +constexpr int kTestBufferSize = 64; +constexpr int kSourceStride = kMaxSuperBlockSizeInPixels + 2 * 8; +constexpr int kSourceBufferSize = + (kMaxSuperBlockSizeInPixels + 2 * 3) * kSourceStride; +constexpr int kNumSpeedTests = 5000; + +const char* GetDirectionDigest(const int bitdepth, const int num_runs) { + static const char* const kDigest[2][2] = { + 
{"de78c820a1fec7e81385aa0a615dbf8c", "7bfc543244f932a542691480dc4541b2"}, + {"b54236de5d25e16c0f8678d9784cb85e", "559144cf183f3c69cb0e5d98cbf532ff"}}; + const int bitdepth_index = (bitdepth == 8) ? 0 : 1; + const int run_index = (num_runs == 1) ? 0 : 1; + return kDigest[bitdepth_index][run_index]; +} + +template <int bitdepth, typename Pixel> +class CdefDirectionTest : public testing::TestWithParam<int> { + public: + CdefDirectionTest() = default; + CdefDirectionTest(const CdefDirectionTest&) = delete; + CdefDirectionTest& operator=(const CdefDirectionTest&) = delete; + ~CdefDirectionTest() override = default; + + protected: + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + CdefInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_cdef_direction_ = nullptr; + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "SSE41/")) { + CdefInit_SSE4_1(); + } else if (absl::StartsWith(test_case, "AVX2/")) { + if ((GetCpuInfo() & kAVX2) != 0) { + CdefInit_AVX2(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + CdefInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + cur_cdef_direction_ = dsp->cdef_direction; + } + + void TestRandomValues(int num_runs); + + Pixel buffer_[kTestBufferSize]; + int strength_; + int size_; + + CdefDirectionFunc base_cdef_direction_; + CdefDirectionFunc cur_cdef_direction_; +}; + +template <int bitdepth, typename Pixel> +void CdefDirectionTest<bitdepth, Pixel>::TestRandomValues(int num_runs) { + if (cur_cdef_direction_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + absl::Duration elapsed_time; + libvpx_test::MD5 actual_digest; + for (int num_tests = 0; num_tests < num_runs; ++num_tests) 
{ + for (int level = 0; level < (1 << bitdepth); level += 1 + (bitdepth - 8)) { + for (int bits = 0; bits <= bitdepth; ++bits) { + for (auto& pixel : buffer_) { + pixel = Clip3((rnd.Rand16() & ((1 << bits) - 1)) + level, 0, + (1 << bitdepth) - 1); + } + int output[2] = {}; + const absl::Time start = absl::Now(); + cur_cdef_direction_(buffer_, kTestBufferStride * sizeof(Pixel), + reinterpret_cast<uint8_t*>(&output[0]), &output[1]); + elapsed_time += absl::Now() - start; + actual_digest.Add(reinterpret_cast<const uint8_t*>(output), + sizeof(output)); + } + } + } + test_utils::CheckMd5Digest(kCdef, kCdefDirectionName, + GetDirectionDigest(bitdepth, num_runs), + actual_digest.Get(), elapsed_time); +} + +using CdefDirectionTest8bpp = CdefDirectionTest<8, uint8_t>; + +TEST_P(CdefDirectionTest8bpp, Correctness) { TestRandomValues(1); } + +TEST_P(CdefDirectionTest8bpp, DISABLED_Speed) { + TestRandomValues(kNumSpeedTests / 100); +} + +INSTANTIATE_TEST_SUITE_P(C, CdefDirectionTest8bpp, testing::Values(0)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, CdefDirectionTest8bpp, testing::Values(0)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, CdefDirectionTest8bpp, testing::Values(0)); +#endif + +#if LIBGAV1_ENABLE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, CdefDirectionTest8bpp, testing::Values(0)); +#endif // LIBGAV1_ENABLE_AVX2 + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using CdefDirectionTest10bpp = CdefDirectionTest<10, uint16_t>; + +TEST_P(CdefDirectionTest10bpp, Correctness) { TestRandomValues(1); } + +TEST_P(CdefDirectionTest10bpp, DISABLED_Speed) { + TestRandomValues(kNumSpeedTests / 100); +} + +INSTANTIATE_TEST_SUITE_P(C, CdefDirectionTest10bpp, testing::Values(0)); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +const char* GetDigest8bpp(int id) { + static const char* const kDigest[] = { + "b6fe1a1f5bbb23e35197160ce57d90bd", "8aed39871b19184f1d381b145779bc33", + "82653dd66072e8ebd967083a0413ab03", "421c048396bc66ffaa6aafa016c7bc54", + 
"1f70ba51091e8c6034c3f0974af241c3", "8f700997452a24091136ca58890a5be4", + "9deaaf07db25ca1d96ea8762925372d3", "7edadd9ad058be518430e64f78fe34a2", + "862362a654edb2562609895395eb69cd", "3b4dae4d353b75f652ce67f96b2fd718", + "65c51f49e4fd848d9fef23a346702b17", "f93b3fa86764e53e4c206ef01d5ee9db", + "202e36551bc147c30b76ae359d5f7646", "3de677a2b6fe4aa6fc29a5e5f2d63063", + "ab860362809e878f7b47dacc6087bce3", "c0d991affc8aeb45d91ae36e7b3d77d8", + "27f19fffabfb79104b4be3c272723f62", "a54b981f562e2cf10a4fb037d0181e2d", + "9a65933d02867a1e8fc1f29097d4d0db", "c068b21d232145c61db8ef9298447bfa", + "8db1948c23648372509e4f3577e8eaa0", "c08a3b192ab0a47abe22f7f0ae78a5d7", + "4ff9bd4ae06f2cc2d2660df41cf1baca", "a0a634e48c55a2ca340cf5cac7f74cb6", + "f9f631985b42214f8b059c8f119d4401", "5fb136073300a45d74145649473970da", + "33624aab8ba0264657fa9304dbdcf72c", "e6a15775d451a3c4803a7c0604deb0ea", + "4c28b63022cdc5ea0e49b492c187d53d", "c5fa9792ee292d29c5a864e376ddacc0", + "fcdf7319978b64f03ca3b9d4d83a0c2a", "394931c89bd5065308b0633d12370b19", + "9e702d68000c1b02759001e9a8876df2", "c844919f0114e83960dd329b1aa7146f", + "499248c675884db3ef57018d0a0868b5", "4a9041ed183f9add717e5ddcdb280799", + }; + return kDigest[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetDigest10bpp(int id) { + static const char* const kDigest[] = { + "0a9630b39974850998db653b07e09ab4", "97a924661d931b23ee57893da617ae70", + "0d79516b9a491ce5112eb00bbae5eb80", "d5801fd96029a7509cf66dde61e8e2d8", + "5bf5c0ea5a85e9b6c1e6991619c34ebc", "e2f1c08a8b3cd93b3a85511493a0ee31", + "18910f422e386c71ffde8680176d61c0", "3255afe8b3db5be4c17299420ae9b4b3", + "ccac34de92891d4ef25820737e7a4f06", "5c2109c4142867c15bc6bb81e19b8058", + "86e8300e2ad292bfce95185530ef06c8", "21c06ed6d62b8fbef1363cd177386cd0", + "fd6687987dbff6f15210c2cc61570daa", "7cb246cb65a9cf9b2f829ab086f7c45a", + "3a38dc3c89f7e400383b1b7ce3e73008", "7b23b520e41ad510b9608b47f9c5f87e", + "f9ca24b57fc06d7b8dc4151bbc4d2840", "070ef8fa64dcdc45701428ee6ef0ca79", + 
"0e7e3ca3cf8546972d01fc262b2b9cfb", "9ac81b7cf93173f33d195927b0a3685a", + "1f964b6959774651a79d961e5a2a6a56", "64d5f88995a918a85df317d4240f0862", + "55c94ec09facda30fac677d205beb708", "2c010b256f4dabf42ef78bf5a3851b2c", + "c7d18d0e287fa8658b94131603e378db", "4f7696fe2c8dbedd0c8e8a53b9dec0fc", + "b3483dc32665a4bb0606d78dfb3d285c", "0bcb4acd4090f5798c2d260df73b2c46", + "4f574c782f3b28fb9c85cdb70dfcb46a", "14bd700a88be0107e9ef2fe54f75cee6", + "5d3b2698c9ffa4a6aed45a9adbddb8bf", "eff870414f80897cf8958ebeea84f0a6", + "e042843275f82271a9f540bc3e4ef35c", "26e3ff3d661dac25861a0f5bab522340", + "239844e66b07796003f9315166b9e29e", "44b8e6884215a1793cc7f8f7ce40bcee", + }; + return kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +struct CdefTestParam { + CdefTestParam(int subsampling_x, int subsampling_y, int rows4x4, + int columns4x4) + : subsampling_x(subsampling_x), + subsampling_y(subsampling_y), + rows4x4(rows4x4), + columns4x4(columns4x4) {} + int subsampling_x; + int subsampling_y; + int rows4x4; + int columns4x4; +}; + +std::ostream& operator<<(std::ostream& os, const CdefTestParam& param) { + return os << "subsampling(x/y): " << param.subsampling_x << "/" + << param.subsampling_y << ", (rows,columns)4x4: " << param.rows4x4 + << ", " << param.columns4x4; +} + +// TODO(b/154245961): rework the parameters for this test to match +// CdefFilteringFuncs. It should cover 4x4, 8x4, 8x8 blocks and +// primary/secondary strength combinations for both Y and UV. 
+template <int bitdepth, typename Pixel> +class CdefFilteringTest : public testing::TestWithParam<CdefTestParam> { + public: + CdefFilteringTest() = default; + CdefFilteringTest(const CdefFilteringTest&) = delete; + CdefFilteringTest& operator=(const CdefFilteringTest&) = delete; + ~CdefFilteringTest() override = default; + + protected: + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + CdefInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + CdefInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + CdefInit_SSE4_1(); + } else if (absl::StartsWith(test_case, "AVX2/")) { + if ((GetCpuInfo() & kAVX2) != 0) { + CdefInit_AVX2(); + } + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + memcpy(cur_cdef_filter_, dsp->cdef_filters, sizeof(cur_cdef_filter_)); + } + + void TestRandomValues(int num_runs); + + uint16_t source_[kSourceBufferSize]; + Pixel dest_[kMaxPlanes][kTestBufferSize]; + int primary_strength_; + int secondary_strength_; + int damping_; + int direction_; + CdefTestParam param_ = GetParam(); + + CdefFilteringFuncs cur_cdef_filter_; +}; + +template <int bitdepth, typename Pixel> +void CdefFilteringTest<bitdepth, Pixel>::TestRandomValues(int num_runs) { + const int id = ((param_.rows4x4 < 4) + (param_.rows4x4 < 2)) * 3 + + param_.subsampling_x * 9 + param_.subsampling_y * 18; + absl::Duration elapsed_time; + for (int num_tests = 0; num_tests < num_runs; ++num_tests) { + for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) { + const int subsampling_x = (plane == kPlaneY) ? 0 : param_.subsampling_x; + const int subsampling_y = (plane == kPlaneY) ? 
0 : param_.subsampling_y; + const int block_width = 8 >> subsampling_x; + const int block_height = 8 >> subsampling_y; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed() + + id + plane); + const int offset = 2 * kSourceStride + 2; + // Fill boundaries with a large value such that cdef does not take them + // into calculation. + const int plane_width = MultiplyBy4(param_.columns4x4) >> subsampling_x; + const int plane_height = MultiplyBy4(param_.rows4x4) >> subsampling_y; + for (int y = 0; y < plane_height; ++y) { + for (int x = 0; x < plane_width; ++x) { + source_[y * kSourceStride + x + offset] = + rnd.Rand16() & ((1 << bitdepth) - 1); + } + } + for (int y = 0; y < 2; ++y) { + Memset(&source_[y * kSourceStride], kCdefLargeValue, kSourceStride); + Memset(&source_[(y + plane_height + 2) * kSourceStride], + kCdefLargeValue, kSourceStride); + } + for (int y = 0; y < plane_height; ++y) { + Memset(&source_[y * kSourceStride + offset - 2], kCdefLargeValue, 2); + Memset(&source_[y * kSourceStride + offset + plane_width], + kCdefLargeValue, 2); + } + do { + int strength = rnd.Rand16() & 15; + if (strength == 3) ++strength; + primary_strength_ = strength << (bitdepth - 8); + } while (primary_strength_ == 0); + do { + int strength = rnd.Rand16() & 3; + if (strength == 3) ++strength; + secondary_strength_ = strength << (bitdepth - 8); + } while (secondary_strength_ == 0); + damping_ = (rnd.Rand16() & 3) + 3; + direction_ = (rnd.Rand16() & 7); + + memset(dest_[plane], 0, sizeof(dest_[plane])); + const absl::Time start = absl::Now(); + const int width_index = block_width >> 3; + if (cur_cdef_filter_[width_index][0] == nullptr) return; + cur_cdef_filter_[width_index][0]( + source_ + offset, kSourceStride, block_height, primary_strength_, + secondary_strength_, damping_, direction_, dest_[plane], + kTestBufferStride * sizeof(dest_[0][0])); + elapsed_time += absl::Now() - start; + } + } + + for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) { + if 
(bitdepth == 8) { + test_utils::CheckMd5Digest(kCdef, kCdefFilterName, + GetDigest8bpp(id + plane), + reinterpret_cast<uint8_t*>(dest_[plane]), + sizeof(dest_[plane]), elapsed_time); +#if LIBGAV1_MAX_BITDEPTH >= 10 + } else { + test_utils::CheckMd5Digest(kCdef, kCdefFilterName, + GetDigest10bpp(id + plane), + reinterpret_cast<uint8_t*>(dest_[plane]), + sizeof(dest_[plane]), elapsed_time); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + } + } +} + +// Do not test single blocks with any subsampling. 2xH and Wx2 blocks are not +// supported. +const CdefTestParam cdef_test_param[] = { + CdefTestParam(0, 0, 4, 4), CdefTestParam(0, 0, 2, 2), + CdefTestParam(1, 0, 4, 4), CdefTestParam(1, 0, 2, 2), + CdefTestParam(0, 1, 4, 4), CdefTestParam(0, 1, 2, 2), + CdefTestParam(1, 1, 4, 4), CdefTestParam(1, 1, 2, 2), +}; + +using CdefFilteringTest8bpp = CdefFilteringTest<8, uint8_t>; + +TEST_P(CdefFilteringTest8bpp, Correctness) { TestRandomValues(1); } + +TEST_P(CdefFilteringTest8bpp, DISABLED_Speed) { + TestRandomValues(kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, CdefFilteringTest8bpp, + testing::ValuesIn(cdef_test_param)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, CdefFilteringTest8bpp, + testing::ValuesIn(cdef_test_param)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, CdefFilteringTest8bpp, + testing::ValuesIn(cdef_test_param)); +#endif + +#if LIBGAV1_ENABLE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, CdefFilteringTest8bpp, + testing::ValuesIn(cdef_test_param)); +#endif // LIBGAV1_ENABLE_AVX2 + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using CdefFilteringTest10bpp = CdefFilteringTest<10, uint16_t>; + +TEST_P(CdefFilteringTest10bpp, Correctness) { TestRandomValues(1); } + +TEST_P(CdefFilteringTest10bpp, DISABLED_Speed) { + TestRandomValues(kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, CdefFilteringTest10bpp, + testing::ValuesIn(cdef_test_param)); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff 
--git a/src/dsp/constants.cc b/src/dsp/constants.cc index 0099ca3..1b85795 100644 --- a/src/dsp/constants.cc +++ b/src/dsp/constants.cc @@ -20,7 +20,7 @@ namespace libgav1 { // Each set of 7 taps is padded with a 0 to easily align and pack into the high // and low 8 bytes. This way, we can load 16 at a time to fit mulhi and mullo. -const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8] = { +alignas(16) const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8] = { {{-6, 10, 0, 0, 0, 12, 0, 0}, {-5, 2, 10, 0, 0, 9, 0, 0}, {-3, 1, 1, 10, 0, 7, 0, 0}, diff --git a/src/dsp/convolve.cc b/src/dsp/convolve.cc index 8c6f68f..727b4af 100644 --- a/src/dsp/convolve.cc +++ b/src/dsp/convolve.cc @@ -623,6 +623,8 @@ void ConvolveIntraBlockCopy2D_C(const void* const reference, const int /*vertical_filter_id*/, const int width, const int height, void* prediction, const ptrdiff_t pred_stride) { + assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels); + assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels); const auto* src = static_cast<const Pixel*>(reference); const ptrdiff_t src_stride = reference_stride / sizeof(Pixel); auto* dest = static_cast<Pixel*>(prediction); @@ -676,6 +678,8 @@ void ConvolveIntraBlockCopy1D_C(const void* const reference, const int /*vertical_filter_id*/, const int width, const int height, void* prediction, const ptrdiff_t pred_stride) { + assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels); + assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels); const auto* src = static_cast<const Pixel*>(reference); const ptrdiff_t src_stride = reference_stride / sizeof(Pixel); auto* dest = static_cast<Pixel*>(prediction); diff --git a/src/dsp/convolve_test.cc b/src/dsp/convolve_test.cc new file mode 100644 index 0000000..4a2a9f1 --- /dev/null +++ b/src/dsp/convolve_test.cc @@ -0,0 +1,1373 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/convolve.h" + +#include <algorithm> +#include <cassert> +#include <cmath> +#include <cstddef> +#include <cstdint> +#include <cstdio> +#include <cstring> +#include <ostream> +#include <string> +#include <tuple> + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/third_party/libvpx/md5_helper.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// The convolve function will access at most (block_height + 7) rows/columns +// from the beginning. +constexpr int kMaxBlockWidth = kMaxSuperBlockSizeInPixels + kSubPixelTaps; +constexpr int kMaxBlockHeight = kMaxSuperBlockSizeInPixels + kSubPixelTaps; + +// Test all the filters in |kSubPixelFilters|. There are 6 different filters but +// filters [4] and [5] are only reached through GetFilterIndex(). +constexpr int kMinimumViableRuns = 4 * 16; + +// When is_scaled_convolve_ is true, we don't test every combination of +// type_param_, so some digests in ths array are redudant, marked as +// "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa". 
+// We keep it so that the logic of calculation id in GetDigestId() is clearer. +const char* GetDigest8bpp(int id) { + static const char* const kDigest[] = { + "ae5977a4ceffbac0cde72a04a43a9d57", "fab093b917d36f6b69fb4f50a6b5c822", + "1168251e6261e2ff1fa69a93226dbd76", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d2f5ca2b7958c332a3fb771f66da01f0", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6bbcc075f8b768a02cdc9149f150326d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c4e90cd202f9867517433b550afdc644", "43d6df191744f6c5d489c0673714a714", + "bfe8197057b0f3f096344251047f481f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1681719b0f8905d99382f4132fe1472a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8d24b59c0f3942079ba4945ed6686269", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ae5977a4ceffbac0cde72a04a43a9d57", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "995318eff1fe62822366490192ad8b5e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0ef1c5beb3228c6d9ecf3ced584c4aa8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "fc02228efb85c665bd27a3dab72a9037", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6cf5f791fe0d8dcd3526be3c6b814035", "eaa0942097fd2b2dd621b77e0a659896", + "4821befdf63f8c6da6440afeb57f320f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7aec92c3b65e456b64ae285c12b03b0d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4ae70d9db2ec36885394db7d59bdd4f7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "911212ae2492690de06d12bfaf71c7d4", "cb284b0ae039582039563638f682db26", + "6b4393b2d7387dd291d3a7bd3aabcae4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0804d93136549388b6cd7fdcd187a578", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b25f037602efdb4eaacb3ade1dc5c28f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6cf5f791fe0d8dcd3526be3c6b814035", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "704b0bb4128aa163ef5899e6d8ad9664", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "abf3f31ec4daff000e80f7ab9628688b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "09e12a389cd454e10f750062102ea1b2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d905dfcad930aded7718587c05b48aaf", "fe85aaee8007d2130d56919242e01163", + "c30fc44d83821141e84cc4793e127301", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f72a99ad63f6a88c23724e898b705d21", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5fee162fe52c11c823db4d5ede370654", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a9210113ff6873e5b50d5d3ad67e440f", "b7633a78f959b20ca27ffb700b44b45c", + "6d1c5145be9fd636ababd64c64d23a10", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d55d8012ddddb55e6c3e51dafab92980", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b1948cb353fa308f0d5592b0ad338997", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d905dfcad930aded7718587c05b48aaf", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "04e3b7f46e748431c76cf6125057601c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "71362b65cffd008d1ca4a20adc8cc15f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "987f7a6a8bef47acbd1e49bb39f51ac4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6baf153feff04cc5b7e87c0bb60a905d", "fa1ad095bf696745599079fb73975b75", + "a8293b933d9f2e5d7f922ea40111d643", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "07a1f07f114c4a38ba08d2f44e1e1132", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9365186c59ef66d9def40f437022ad93", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a7305087fae23de53d21a6909009ff69", "bd44440b5757b74bcc3e2f7f32ef42af", + "a5a1ac658d7ce4a846a32b9fcfaa3475", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3b1ceebf0579fcbbfd6136938c595b91", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3bfad931bce82335219e0e29c15f2b21", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6baf153feff04cc5b7e87c0bb60a905d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4cfad2c437084a93ea76913e21c2dd89", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1a0bdfc96a3b9fd904e658f238ab1076", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b8a710baa6a9fc784909671d450ecd99", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "871ed5a69ca31e6444faa720895949bf", "e55d0c54fd28355d32e29d411488b571", + "354a54861a94e8b027afd9931e61f997", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "26b9de95edb45b31ac5aa19825831c7a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0f95fb0276c9c7910937fbdf75f2811d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8dcce009395264379c1a51239f4bb22c", "06925f05ea49811e3efc2a44b111b32b", + "2370f4e4a83edf91b7f504bbe4b00e90", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ecafabcad1045f15d31ce2f3b13132f2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "68a701313d2247d2b32636ebc1f2a008", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "871ed5a69ca31e6444faa720895949bf", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d372f0c17bce98855d6d59fbee814c3d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "56d16e54afe205e97527902770e71c71", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f9e6a56382d8d12da676d6631bb6ef75", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "68e2f90eaa0ab5da7e6f5776993f7eea", "8718965c4831a363a321a25f4aada7ba", + "eeeb8589c1b31cbb565154736ca939ec", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c1b836a6ce023663b90db0e320389414", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b355dab2dbb6f5869018563eece22862", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8dcce009395264379c1a51239f4bb22c", "e7c2bfd356c860c36053dea19f8d0705", + "ae5464066a049622a7a264cdf9394b55", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5f211eba020e256a5781b203c5aa1d2e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "05afe1f40d37a45a97a5e0aadd5066fb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "68e2f90eaa0ab5da7e6f5776993f7eea", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d99ffd2579eb781c30bc0df7b76ad61e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1f7b5b8282ff3cf4d8e8c52d80ef5b4d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3bf8e11e18527b16f0d7c0361d74a52d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f1f8282fb33c30eb68c0c315b7a4bc01", "4c718ddbe8b5aa7118c8bc1c2f5ea158", + "f49dab626ddd977ed171f79295c24935", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5befcf222152ebc8d779fcc10b95320a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "cf6ff8c43d8059cea6090a23ab66a0ef", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d90a69e7bae8aa46ed0e1e5f911d7a07", "1d7113d705fa0edeef49e5c50a91151d", + "45368b6db3d1fee739a64b0bc823ea9c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3b04497634364dd2cd3f2482b5d4b32f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9e1f0e0bddb58d15d0925eeaede9b84c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f1f8282fb33c30eb68c0c315b7a4bc01", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4e139e57cbb049a0f4ef816adc48d026", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "79e9e260a2028c5fe320005c272064b9", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b9ff54c6f1e3b41fc7fc0f3fa0e75cf2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9412064b0eebf8123f23d74147d04dff", "0dee657827cd48c4ce4a7657f6f92233", + "78d2f27e0d4708cb16856d7d40dc16fb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "62adf407fc27d8682ced4dd7b55af14e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a336f8b7bcf188840ca65c0d0e66518a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6ab4dc87be03be1dcc5d956ca819d938", "78cef82670ff99b1e4a279de3538c233", + "8dff0f28192d9f8c0bf7fb5405719dd8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a8ac7b5dc65ffb758b0643508a0e744e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "03313cdaa593a1a7b4869010dcc7b241", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9412064b0eebf8123f23d74147d04dff", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "be53b2507048e7ff50226d15c0b28865", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2418ebcdf85551b9ae6e3725f04aae6d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "06ef1504f31af5f173d3317866ca57cb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "cc08936effe309ab9a4fa1bf7e28e24e", "a81bcdeb021d3a23477c40c47548df52", + "9d2393ea156a1c2083f5b4207793064b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "35be0786a072bf2f1286989261bf6580", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "de953f03895923359c6a719e6a537b89", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6ab4dc87be03be1dcc5d956ca819d938", 
"e053321d7c75951d5ff3dce85762acd3", + "632738ef3ff3021cff45045c41978849", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "561ed8be43c221a561f8885a0d74c7ef", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "88a50d2b4107ee5b5074b2520183f8ac", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "cc08936effe309ab9a4fa1bf7e28e24e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b73f3c1a10405de89d1f9e812ff73b5a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "98bdf907ebacacb734c9eef1ee727c6e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "635e8ee11cf04d73598549234ad732a0", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "36cbef36fa21b98df03536c918bf752a", "b7a4d080e2f24040eebb785f437de66a", + "a9c62745b95c66fa497a524886af57e2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "90562fc42dc5d879ae74c4909c1dec30", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8463ade9347ed602663e2cec5c4c3fe6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8f2afdb2f03cd04ffacd421b958caaa0", "2e15a26905467e5ad9f8da04b94e60b6", + "f7ec43384037e8d6c618e0df826ec029", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8159619fc234598c8c75154d80021fd4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ac50ea9f7306da95a5092709442989cf", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "36cbef36fa21b98df03536c918bf752a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c7d51b1f2df49ab83962257e8a5934e5", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4dd5672d53c8f359e8f80badaa843dfc", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "fab693410d59ee88aa2895527efc31ac", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9d0da6321cf5311ea0bdd41271763030", "22ff7819c55ce6b2e0ce5431eb8c309c", + "2c614ec4463386ec075a0f1dbb587933", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a1427352f9e413975a0949e2b300c657", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "392de11ffcd5c2ecf3db3480ee135340", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "710ccecc103033088d898a2b924551fb", "160c29a91e372d66b12e171e4d81bc18", + "a6bc648197781a2dc99c487e66464320", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8f43645dce92cf7594aa4822aa53b17d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "739b17591437edffd36799237b962658", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9d0da6321cf5311ea0bdd41271763030", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "159e443d79cc59b11ca4a80aa7aa09be", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a1bef519bbf07138e2eec5a91694de46", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3041eb26c23a63a587fbec623919e2d2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "55a10165ee8a660d7dddacf7de558cdd", "355b691a656e6a287e4504ef2fbb8034", + "7a8856480d752153370240b066b90f6a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "bcbc418bc2beb243e463851cd95335a9", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "bddd31e3e852712e6244b616622af83d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "710ccecc103033088d898a2b924551fb", "f6cb80c4d5683553929b1e41f99e487e", + "1112ebd509007154c72c5a485b220b62", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b6ccddb7dfa4eddc87b4eff08b5a3195", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b8a7eb7dd9c216e240517edfc6489397", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "55a10165ee8a660d7dddacf7de558cdd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6ef14b14882e1465b0482b0e0b16d8ce", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "df1cb51fe1a937cd7834e973dc5cb814", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c61d99d5daf575664fb7ad64976f4b03", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ac7fc9f9ea7213743fae5a023faaaf08", "a6307a981600c3fb5b9d3e89ddf55069", + "beaef1dbffadc701fccb7c18a03e3a41", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "cb8fedcbecee3947358dc61f95e56530", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "30a36245c40d978fc8976b442a8600c3", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a4093e3e5902dd659407ce6471635a4e", "658f0f51eb2f965f7490053852653fc0", + "9714c4ce636b6fb0ad05cba246d48c76", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b4e605327b28db573d88844a1a09db8d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "75b755f199dbf4a0e5ebbb86c2bd871d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ac7fc9f9ea7213743fae5a023faaaf08", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "22a8d287b425c870f40c64a50f91ce54", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "317fe65abf81ef3ea07976ef8667baeb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "822f6c4eb5db760468d822b21f48d94d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "077e1b7b355c7ab3ca40230ee8efd8ea", "628229ce2484d67e72c51b2f4ad124a6", + "72b1e700c949d06eaf62d664dafdb5b6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0d0154a7d573685285a83a4cf201ac57", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "93aa662b988b8502e5ea95659eafde59", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "375d7f5358d7a088a498b8b3aaecc0d5", "b726ef75b641c21519ecc2f802bbaf39", + "2c93dde8884f09fb5bb5ad6d95cde86d", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "15b00a15d1cc6cc96ca85d00b167e4dd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "31b0017ba1110e3d70b020901bc15564", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "077e1b7b355c7ab3ca40230ee8efd8ea", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f1d96db5a2e0a2160df38bd96d28d19b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2da29da97806ae0ee300c5e69c35a4aa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3f6fcb9fae3666e085b9e29002a802fc", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7a3e8de2a1caae206cf3e51a86dfd15a", "c266a1b65599686c771fad8a822e7a49", + "684f5c3a25a080edaf79add6e9137a8e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b14bd8068f108905682b83cc15778065", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "70440ba9ee7f9d16d297dbb49e54a56e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "375d7f5358d7a088a498b8b3aaecc0d5", "4dca696cc0552c1d684c4fc963adc336", + "a49e6160b5d1b56bc2046963101cd606", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7bf911888c11a9fefd604b8b9c82e9a1", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0a1aa8f5ecfd11ddba080af0051c576a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7a3e8de2a1caae206cf3e51a86dfd15a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "637d1e5221422dfe9a6dbcfd7f62ebdd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "555475f5d1685638169ab904447e4f13", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d9b9fecd195736a6049c528d4cb886b5", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1ddf9020f18fa7883355cf8c0881186a", 
"e681b35b1fe02e2a6698525040015cd0", + "3be970f49e4288988818b087201d54da", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c96c867d998473197dde9b587be14e3a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1eb2be4c05b50e427e29c72fa566bff5", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "08867ea5cc38c705ec52af821bc4736a", "c51c8bb294f4fa20bdab355ad1e7df37", + "7f084953976111e9f65b57876e7552b1", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "bfb69b4d7d4aed73cfa75a0f55b66440", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "536181ee90de883cc383787aec089221", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1ddf9020f18fa7883355cf8c0881186a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f275af4f1f350ffaaf650310cb5dddec", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b3e3a6234e8045e6182cf90a09f767b2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "fed17fc391e6c3db4aa14ea1d6596c87", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2377dd167ef2707978bed6f10ffd4e76", "b1f6c0cd490b584b1883222a4c281e0f", + "d2b9dba2968894a414756bb510ac389a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f596c63c7b14cada0174e17124c83942", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "52c0980bae63e8459e82eee7d8af2334", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2afb540e8063f58d1b03896486c5e89b", "b929f7956cf35dd6225ca6cf45eacb23", + "0846ec82555b66197c5c45b08240fbcc", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "034d1d62581bd0d840c4cf1e28227931", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "29f82b0f3e4113944bd28aacd9b8489a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2377dd167ef2707978bed6f10ffd4e76", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f81c4d6b001a14584528880fa6988a87", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "849dfeca59074525dea59681a7f88ab4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d0d3482d981989e117cbb32fc4550267", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f918e0e4422967c6a7e47298135c7ae9", "fc8718e6f9e6663c2b6bf9710f835bfc", + "9a3215eb97aedbbddd76c7440837d040", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "eb2822ad8204ed4ecbf0f30fcb210498", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "75e57104d6058cd2bce1d3d8142d273d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2afb540e8063f58d1b03896486c5e89b", "d9d9f3c699cd03ab9d698e6b235ddcc6", + "ca7471c126ccd22189e874f0a6e41960", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8cba849640e9e2859d509bc81ca94acd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ee3e76371240d1f1ff811cea6a7d4f63", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f918e0e4422967c6a7e47298135c7ae9", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a5a2f9c2e7759d8a3dec1bc4b56be587", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "39a68af80be11e1682b6f3c4ede33530", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "39561688bf6680054edbfae6035316ce", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b2264e129636368b5496760b39e64b7a", "4dbb4ce94d4948c990a51b15959d2fa6", + "4e317feac6da46addf0e8b9d8d54304b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "538ce869ffd23b6963e61badfab7712b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b4c735269ade44419169adbd852d5ddc", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6ce47b11d2e60c5d183c84ce9f2e46cc", "3ac8d5b68ebb29fd1a41c5fa9d5f4382", + "0802b6318fbd0969a33de8fdfcd07f10", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "bc79acf2a0fe419194cdb4529bc7dcc8", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "17a20dbbf09feae557d40aa5818fbe76", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b2264e129636368b5496760b39e64b7a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2317c57ab69a36eb3bf278cf8a8795a3", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b22d765af176d87e7d3048b4b89b86ad", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "087c5992ca6f829e1ba4ba5332d67947", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c9cf1deba08dac5972b3b0a43eff8f98", "84777bdeb84e2530a1c8c1ee432ec934", + "b384e9e3d81f9f4f9024028fbe451d8b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4e4677a0623d44237eb8d6a622cdc526", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "356d4003477283e157c8d2b5a79d913c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c9cf1deba08dac5972b3b0a43eff8f98", "1e58b76ca365b0bd4fd3c4519ec4a500", + "24accebe2e795b13fcb56dd3abacf53f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "98f584ceaf2d65af997f85d71ceeda1b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c9cf1deba08dac5972b3b0a43eff8f98", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1e58b76ca365b0bd4fd3c4519ec4a500", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "24accebe2e795b13fcb56dd3abacf53f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "98f584ceaf2d65af997f85d71ceeda1b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + }; + return kDigest[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetDigest10bpp(int id) { + static 
const char* const kDigest[] = { + "b1b6903d60501c7bc11e5285beb26a52", "3fa4ebd556ea33cfa7f0129ddfda0c5b", + "a693b4bd0334a3b98d45e67d3985bb63", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3e787534dff83c22b3033750e448865a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "fd1da8d197cb385f7917cd296d67afb9", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d9941769b66d012c68f70accc1a3b664", "98728677401560d7c29ba8bec59c6a00", + "2924788891caa175bb0725b57de6cbd2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "915a60e7bb2c38ad5a556098230d6092", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a25de86fd8d389c1c75405aac8049b58", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b1b6903d60501c7bc11e5285beb26a52", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "cf792b94b1f3f321fa0c1d6362d89c90", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5f1622fde194bd04560b04f13dc47a7c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d935e0ec1d933d0c48fa529be4f998eb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a7855ed75772d7fa815978a202bbcd9f", "cd3e8b96ff6796650e138f5d106d70d4", + "156de3172d9acf3c7f251cd7a18ad461", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4c91f676a054d582bcae1ca9adb87a31", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a984202c527b757337c605443f376915", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "20a390cc7e06a265ecc1e118f776c25a", "ab0da36b88021ed0efd806a1a4cd4fa0", + "fc57a318fbf0c0f29c24edbc84e35ec6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "568055866caf274d67e984307cda2742", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3ff2b19730d6bb8b97f4d72085d2d5b8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a7855ed75772d7fa815978a202bbcd9f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "acc8588292b326f15076dd3a3d260072", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f990a13f7a062665d7f18a40bd5da2ae", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "931df73c3d50c4b2e4ec3502bc8774de", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "bde291a4e8087c085fe8b3632f4d7351", "555eead3b67766f56b0e3714d431506f", + "e545b8a3ff958f8363c7968cbae96732", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "eab5894046a99ad0a1a12c91b0f37bd7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c347f4a58fd784c5e88c1a23e4ff15d2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9272ee0820b09bfdc252a97b2e103862", "be8dd418158226a00d5e01ccc3e4f66b", + "34b37b59ee49108276be28a2e4585c2d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f4deb462014249d4ab02db7f7f62308e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6ae557169928f3be15c7aad8d67205b1", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "bde291a4e8087c085fe8b3632f4d7351", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "14be0f12550c814f75655b4e1e22ddde", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "af4cadb78ee54aacebac76c8ad275375", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c0c4ebfd6dbbddd88114c36e8c9085da", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "238980eebc9e63ae3eea2771c7a70f12", "661c69a7b49984fa1e92cf8485ab28b6", + "7842b2047356c1417d9d88219707f1a1", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "765b4cfbfc1a4988878c412d53bcb597", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "29cbaadbff9adf4a3d49bd9900a9dd0b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7e3fa9c03bc3dfbdeb67f24c5d9a49cd", "a65e13b534b32fdff3f48d09389daaf1", + "da1a6ff2be03ec8acde4cb1cd519a6f0", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d54206c34785cc3d8a06c2ceac46378c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b1f26ee13df2e14a757416ba8a682278", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "238980eebc9e63ae3eea2771c7a70f12", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "e552466a4e7ff187251b8914b084d404", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aba5d5ef5e96fe418e65d20e506ea834", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "972aeba65e8a6d20dd0f95279be2aa75", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0eac13431bd7d8a573318408a72246d5", "71c57b774e4c3d9b965b060e2a895448", + "1a487c658d684314d91bb6d961a94672", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "bc63b29ec78c1efec5543885a45bb822", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c5997b802a6ba1cf5ba1057ddc5baa7e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f3454ca93cbb0c8c09b0695d90a0df3d", "d259b9c0d0e3322114b2bcce04ae35dd", + "a4ca37cb869a0dbd1c4a2dcc449a8f31", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "85a11892ed884e3e74968435f6b16e64", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "996b6c166f9ed25bd07ea6acdf7597ff", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0eac13431bd7d8a573318408a72246d5", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "981b7c44b6f7b7ac2acf0cc4096e6bf4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d70bf16e2a31e90b7b3cdeaef1494cf9", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "34165457282e2af2e9b3f5840e4dec5d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "73438155feb62595e3e406921102d748", 
"86d00d2e3dd4a198343f37e3dc4461c9", + "0635a296be01b7e641de98ee27c33cd2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "cecd57396a0033456408f3f3554c6912", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "59f33727e5beeb783a057770bec7b4cd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f3454ca93cbb0c8c09b0695d90a0df3d", "b11f98b5bb864413952d47a67b4add79", + "1b5d1d4c7be8d5ec00a42a49eecf918f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "16434230d24b9522ae2680e8c37e1b95", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "34895d4c69a6c3303693e6f431bcd5d8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "73438155feb62595e3e406921102d748", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a4c75372af36162831cb872e24e1088c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6df80bb7f264f4f285d09a4d61533fae", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b8c5582b9bbb789c45471f93be83b41f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5871e0e88a776840d619670fbf107858", "57dd2cde826c50e0b0ec504396cb3ceb", + "82dc120bf8c2043bc5eee81007309ebf", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5b37f94ef136c1eb9a6181c19491459c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0654d72f22306b28d9ae42515845240c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1a77d2af4d2b6cf8737cfbcacacdc4e4", "7123d4aa8083da90ec6986dda0e126ce", + "98b77e88b0784baaea64c98c8707fe46", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "963dea92f3efbb99137d1de9c56728d3", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c9497b00cb1bc3363dd126ffdddadc8e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5871e0e88a776840d619670fbf107858", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "497271227a70a72f9ad25b415d41563f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c8831118d1004a7cca015a4fca140018", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "257bf5467db570974d7cf2356bacf116", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1c6376ce55c9ee9e35d432edb1ffb3b7", "6fff9189c1d11f183f7c42d4ce5febdb", + "58c826cad3c14cdf26a649265758c58b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "716ba3a25b454e44b46caa42622c128c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6c9d7d9e6ef81d76e775a85c53abe209", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "89bec831efea2f88129dedcad06bb3fa", "e1ef4ae726d864b36a9b64b1e43ede7e", + "8148788044522edc3c497e1017efe2ce", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b72fb6a9a073c2fe65013af1842dc9b0", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1e461869bb2ee9b6069c5e52cf817291", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1c6376ce55c9ee9e35d432edb1ffb3b7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c48bd7e11ec44ba7b2bc8b6a04592439", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b7f82c140369067c105c7967c75b6f9e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5255dded79f56b0078543b5a1814a668", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d675e0195c9feca956e637f3f1959f40", "670fa8c31c82fced9a810b64c03e87ee", + "f166254037c0dfb140f54cd7b08bddfe", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9076f58c4ab20f2f06d701a6b53b1c4f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a35f435ccc67717a49251a07e62ae204", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "89bec831efea2f88129dedcad06bb3fa", "7c3a79a90f3f4b460540e796f3197ef1", + "acf60abeda98bbea161139b915317423", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "86fa0c299737eb499cbcdce94abe2d33", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8d7f1d7ea6a0dcc922ad5d2e77bc74dd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d675e0195c9feca956e637f3f1959f40", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0960a9af91250e9faa1eaac32227bf6f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "130f47aae365aabfec4360fa5b5ff554", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ef745100f5f34c8ff841b2b0b57eb33f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b5681673903ade13d69e295f82fdd009", "9ccd4cc6216eab35ddcb66a76b55dd2f", + "74ab206f14ac5f62653cd3dd71a7916d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d3212ab3922f147c3cf126c3b1aa17f6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c5325015cb0b7c42839ac4aa21803fa0", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "dead0fe4030085c22e92d16bb110de9d", "3c6d97f25d6bc647c843850be007f512", + "262c96b1f2c4f85c86c0e9c77fedff1e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6b80af04470b83673d98f46925e678a5", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "138855d9bf0ccd0c62ac14c7bff4fd37", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b5681673903ade13d69e295f82fdd009", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "746c2e0f96ae2246d534d67102be068c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "92483ed631de21b685ffe6ccadbbec8f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "edae8ed67286ca6a31573a541b3deb6f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3c43020105ae93a301404b4cd6238654", "cef7cfdcb8ca8d2612f31a1fe95ce371", + "5621caef7cc1d6522903290ccc5c2cb8", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b55fea77f0e14a8bf8b6562b766fe91f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f81f31f1585c0f70438c09e829416f20", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "306a2f5dfd675df4ed9af44fd5cac8c0", "1dfda318021a05a7e72fd815ddb0dfc8", + "f35a3d13516440f9168076d9b07c9e98", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "65baca6167fe5249f7a839ce5b2fd591", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "64035142864914d05a48ef8e013631d0", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3c43020105ae93a301404b4cd6238654", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d6f6db079da9b8909a153c07cc9d0e63", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "cbb6ab31547df6b91cfb48630fdffb48", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "01adcd8bf15fbf70df47fbf3a953aa14", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "dd2c5880a94ed3758bfea0b0e8c78286", "5f6c1725f4c7c73a8d8f0d9468106624", + "78ec6cf42cce4b1feb65e076c78ca241", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "59b578268ff26a1e21c5b4273f73f852", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ab10b22fb8dd8199040745565b28595d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "306a2f5dfd675df4ed9af44fd5cac8c0", "9209f83153ef6f09b5262536a2dc1671", + "13782526fc2726100cb3cf375b3150ed", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "e47ded6c0eec1d5baadd02aff172f2b1", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "205904fa3c644433b46e01c11dd2fe40", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "dd2c5880a94ed3758bfea0b0e8c78286", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7c8928a0d769f4264d195f39cb68a772", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1eea5e8a24d6aa11778eb3e5e5e9c9f2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ba539808a8501609ce052a1562a62b25", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4ebb1a7b25a39d8b9868ec8a1243103f", "c2732a08997e1f5176dfb297d2e89235", + "42188e2dbb4e02cd353552ea147ad03f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "16761e7c8ba2645718153bed83ae78f6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0d928d6111f86c60ccefc6c6604d5659", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9d01c946a12f5ef9d9cebd9816e06014", "d738eb9f3f4f0b412b93687b55b6e45a", + "13c07441b47b0c1ed80f015ac302d220", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c0950e609f278efb7050d319a9756bb3", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "291425aaf8206b20e88db8ebf3cf7e7f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4ebb1a7b25a39d8b9868ec8a1243103f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "db645c96fc8be04015e0eb538afec9ae", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9e193b6b28ce798c44c744efde19eee9", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ac8e6391200cec2abdebb00744a2ba82", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d34ec07845cd8523651e5f5112984a14", "745c794b557d4a0d734e45d720a7f7ad", + "f9813870fc27941a7c00a0443d7c2fe7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a9e9805769fe1baf5c7933793ccca0d8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4ed1a6200912995d4f571bdb7822aa83", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "768f63912e43148c13688d7f23281531", "43fb786fd2e79610d6a6d912b95f4509", + "02880fde51ac991ad18d8986f4e5145c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9051290279237f9fb1389989b142d2dd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "cb6238b8eb6b72980958e6fcceb2f2eb", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d34ec07845cd8523651e5f5112984a14", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "946af3a8f5362def5f4e27cb0fd4e754", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "885c384d90aaa34acd8303958033c252", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "54b17120f7d71ddb4d70590ecd231cc1", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2ce55308d873f4cd244f16da2b06e06e", "af7b76d3471cfbdc97d1e57bc2876ce7", + "20b14a6b5af7aa356963bcaaf23d230d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "553a2c24939dff18ec5833c77f556cfb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "92e31a45513582f386dc9c22a57bbbbd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "768f63912e43148c13688d7f23281531", "4e255554dab9dfa1064e20a905538308", + "aa25073115bad49432953254e7dce0bc", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "34cdc1be291c95981c98812c5c343a15", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "626321a6dfac542d0fc70321fac13ff3", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2ce55308d873f4cd244f16da2b06e06e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7ad78dfe7bbedf696dd58d9ad01bcfba", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8110ed10e7234851dff3c7e4a51108a2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f6e36446a97611a4db4425df926974b2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a4bb5d5ff4b25f391265b5231049a09a", "cf4867c6b1b8be86a7e0bee708c28d83", + "9c9c41435697f75fa118b6d6464ee7cb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5c1ec75a160c444fa90abf106fa1140e", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6dbf310a9c8d85f76306d6a35545f8af", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2e7927158e7b8e40e7269fc909fb584b", "8b72feff8bb0901229a2bd7da2857c4b", + "69e3361b7199e10e75685b90fb0df623", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5b64a6911cb7c3d60bb8f961ed9782a2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1c6fda7501e0f8bdad972f7857cd9354", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a4bb5d5ff4b25f391265b5231049a09a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f0fd9c09d454e4ce918faa97e9ac10be", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6fb9383302eb7e7a13387464d2634e03", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a82f4080699300b659bbe1b5c4463147", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c9106e0c820b03bcdde3aa94efc11a3e", "0408e10e51a31ac756a57d5149a2b409", + "38816245ed832ba313fefafcbed1e5c8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2266840f11ac4c066d941ec473b1a54f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "80fce29dc82d5857c1ed5ef2aea16835", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "123028e18c2bfb334e34adb5a4f67de4", "1670eb8ed876e609ed81236a683b4a3d", + "2f8ab35f6e7030e82ca922a68b29af4a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7133de9d03a4b07716a12226b5e493e8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4fd485dadcb570e5a0a5addaf9ba84da", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c9106e0c820b03bcdde3aa94efc11a3e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "af6ae5c0eb28417bd251184baf2eaba7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "864d51fcc737bc73a3f588b67515039a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ecedb178f7cad3dc1b921eca67f9efb6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7ec2eae9e118506da8b33440b399511a", "108a4a6530a6b9c933ccf14edbd896be", + "5d34137cc8ddba75347b0fa1d0a91791", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9e194755b2a37b615a517d5f8746dfbb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "14f2c5b9d2cd621c178a39f1ec0c38eb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "123028e18c2bfb334e34adb5a4f67de4", "2fdc713ba418780d0be33a3ebbcb323c", + "452f91b01833c57db4e909575a029ff6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3594eff52d5ed875bd9655ddbf106fae", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d3f140aea9e8eabf4e1e5190e0148288", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7ec2eae9e118506da8b33440b399511a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "866f8df540dd3b58ab1339314d139cbd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2ecb7890f00234bcb28c1d969f489012", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0609ca0ff3ca90069e8b48829b4b0891", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "78de867c8ee947ed6d29055747f26949", "0a7cb4f51f1acf0940b59295b2327465", + "465dcb046a0449b9dfb3e0b297aa3863", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "bbf86f8174334f0b8d869fd8d58bf92d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "da54cfb4530841bda29966cfa05f4879", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2c979c2bddef79a760e72a802f83cc76", "545426be3436073ba63790aa3c4a5598", + "1fabf0655bedb671e4d7287fec8119ba", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "90d7e13aa2f9a064493ff2b3b5b12109", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "e4938219593bbed5ae638a93f2f4a580", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "78de867c8ee947ed6d29055747f26949", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "72803589b453a29501540aeddc23e6f4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c4793d431dbf2d88826bb440bf027512", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "839e86c681e97359f7819c766000dd1c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d05a237ed7a9ca877256b71555b1b8e4", "3052776d186fca6dd8011f4fe908a212", + "94b3e5bcd6b849b66a4571ec3d23f9be", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "91d6bdbc62d4bb80c9b371d9704e3c9e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4f750f6375524311d260306deb233861", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d05a237ed7a9ca877256b71555b1b8e4", "03ce2d07cac044d6b68604d398571844", + "68ece92dcbe70a2ae9776d72972740a7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "380d296d0d55a49dd86ee562b053a9d8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d05a237ed7a9ca877256b71555b1b8e4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "03ce2d07cac044d6b68604d398571844", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "68ece92dcbe70a2ae9776d72972740a7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "380d296d0d55a49dd86ee562b053a9d8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + }; + return kDigest[id]; +} +#endif + +struct ConvolveTestParam { + ConvolveTestParam(int width, int height) : width(width), height(height) {} + int width; + int height; +}; + +struct ConvolveTypeParam { + ConvolveTypeParam(bool is_intra_block_copy, bool 
is_compound, + bool has_vertical_filter, bool has_horizontal_filter) + : is_intra_block_copy(is_intra_block_copy), + is_compound(is_compound), + has_vertical_filter(has_vertical_filter), + has_horizontal_filter(has_horizontal_filter) {} + bool is_intra_block_copy; + bool is_compound; + bool has_vertical_filter; + bool has_horizontal_filter; +}; + +std::ostream& operator<<(std::ostream& os, const ConvolveTestParam& param) { + return os << "BlockSize" << param.width << "x" << param.height; +} + +std::ostream& operator<<(std::ostream& os, const ConvolveTypeParam& param) { + return os << "is_intra_block_copy: " << param.is_intra_block_copy + << ", is_compound: " << param.is_compound + << ", has_(vertical/horizontal)_filter: " + << param.has_vertical_filter << "/" << param.has_horizontal_filter; +} + +// TODO(b/146062680): split this to ConvolveTest and ConvolveScaleTest to +// simplify the members and test logic. +template <int bitdepth, typename Pixel> +class ConvolveTest + : public testing::TestWithParam< + std::tuple<ConvolveTestParam, ConvolveTypeParam, bool>> { + public: + ConvolveTest() = default; + ~ConvolveTest() override = default; + + void SetUp() override { + ConvolveInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + GetConvolveFuncs(dsp, &base_convolve_func_, &base_convolve_scale_func_); + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + base_convolve_func_ = nullptr; + base_convolve_scale_func_ = nullptr; + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + ConvolveInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "AVX2/")) { + if ((GetCpuInfo() & kAVX2) != 0) { + ConvolveInit_AVX2(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + ConvolveInit_NEON(); + } else { + FAIL() << "Unrecognized 
architecture prefix in test case name: " + << test_case; + } + + GetConvolveFuncs(dsp, &cur_convolve_func_, &cur_convolve_scale_func_); + + // Skip functions that have not been specialized for this particular + // architecture. + if (cur_convolve_func_ == base_convolve_func_) { + cur_convolve_func_ = nullptr; + } + if (cur_convolve_scale_func_ == base_convolve_scale_func_) { + cur_convolve_scale_func_ = nullptr; + } + } + + protected: + int GetDigestId() const { + // id is the combination of the 3-dimension array: + // (param_, type_param_, is_scaled_convolve_) + // The number of each array is 20, 16, 2. + // The range of id is from 0 to 20x16x2 - 1. + // is_scaled_convolve_: false, id += 0; true, id += 1; + // type_param_: (0, 0, 0, 0), id += 0 * 2. + // (0, 0, 0, 1), id += 1 * 2; (0, 0, 1, 0), id += 2 * 2; + // ... + // param_: (2, 2), id += 0 * 32; (2, 4), id += 1 * 32; + // (4, 2), id += 2 * 32; (4, 4), id += 3 * 32; + // ... + int id = static_cast<int>(is_scaled_convolve_); + id += 2 * static_cast<int>(type_param_.has_horizontal_filter); + id += 2 * 2 * static_cast<int>(type_param_.has_vertical_filter); + id += 2 * 4 * static_cast<int>(type_param_.is_compound); + id += 2 * 8 * static_cast<int>(type_param_.is_intra_block_copy); + if (param_.width == param_.height) { + id += 32 * 3 * static_cast<int>(std::log2(param_.width) - 1); + } else if (param_.width < param_.height) { + id += 32 * (1 + 3 * static_cast<int>(std::log2(param_.width) - 1)); + } else { + // param_.width > param_.height + if (param_.width == 8 && param_.height == 2) { + // Special case is at the end of the array. 
+ id += 32 * 19; + } else { + id += 32 * (2 + 3 * static_cast<int>(std::log2(param_.height) - 1)); + } + } + return id; + } + + void GetConvolveFuncs(const Dsp* dsp, ConvolveFunc* func, + ConvolveScaleFunc* scale_func); + void SetInputData(bool use_fixed_values, int value); + void Check(bool use_fixed_values, const Pixel* src, const Pixel* dest, + libvpx_test::MD5* md5_digest); + void Check16Bit(bool use_fixed_values, const uint16_t* src, + const uint16_t* dest, libvpx_test::MD5* md5_digest); + // |num_runs| covers the categories of filters (6) and the number of filters + // under each category (16). + void Test(bool use_fixed_values, int value, + int num_runs = kMinimumViableRuns); + + const ConvolveTestParam param_ = std::get<0>(GetParam()); + const ConvolveTypeParam type_param_ = std::get<1>(GetParam()); + const bool is_scaled_convolve_ = std::get<2>(GetParam()); + + private: + ConvolveFunc base_convolve_func_; + ConvolveFunc cur_convolve_func_; + ConvolveScaleFunc base_convolve_scale_func_; + ConvolveScaleFunc cur_convolve_scale_func_; + // Convolve filters are 7-tap, which needs 3 pixels (kRestorationBoder) + // padding. + // When is_scaled_convolve_ is true, the source can be at most 2 times of + // max width/height. So we allocate a larger memory for it and setup the + // extra memory when is_scaled_convolve_ is true. + Pixel source_[kMaxBlockHeight * kMaxBlockWidth * 4] = {}; + uint16_t source_16bit_[kMaxBlockHeight * kMaxBlockWidth * 4] = {}; + uint16_t dest_16bit_[kMaxBlockHeight * kMaxBlockWidth] = {}; + Pixel dest_clipped_[kMaxBlockHeight * kMaxBlockWidth] = {}; + + const int source_stride_ = + is_scaled_convolve_ ? kMaxBlockWidth * 2 : kMaxBlockWidth; + const int source_height_ = + is_scaled_convolve_ ? 
kMaxBlockHeight * 2 : kMaxBlockHeight; +}; + +template <int bitdepth, typename Pixel> +void ConvolveTest<bitdepth, Pixel>::GetConvolveFuncs( + const Dsp* const dsp, ConvolveFunc* func, ConvolveScaleFunc* scale_func) { + if (is_scaled_convolve_) { + *func = nullptr; + *scale_func = dsp->convolve_scale[type_param_.is_compound]; + } else { + *scale_func = nullptr; + *func = + dsp->convolve[type_param_.is_intra_block_copy][type_param_.is_compound] + [type_param_.has_vertical_filter] + [type_param_.has_horizontal_filter]; + } +} + +template <int bitdepth, typename Pixel> +void ConvolveTest<bitdepth, Pixel>::SetInputData(bool use_fixed_values, + int value) { + if (use_fixed_values) { + std::fill(source_, source_ + source_height_ * source_stride_, value); + } else { + const int offset = + kConvolveBorderLeftTop * source_stride_ + kConvolveBorderLeftTop; + const int mask = (1 << bitdepth) - 1; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + const int height = is_scaled_convolve_ ? param_.height * 2 : param_.height; + const int width = is_scaled_convolve_ ? param_.width * 2 : param_.width; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + source_[y * source_stride_ + x + offset] = rnd.Rand16() & mask; + } + } + // Copy border pixels to the left and right borders. + for (int y = 0; y < height; ++y) { + Memset(&source_[(y + kConvolveBorderLeftTop) * source_stride_], + source_[y * source_stride_ + offset], kConvolveBorderLeftTop); + Memset(&source_[y * source_stride_ + offset + width], + source_[y * source_stride_ + offset + width - 1], + kConvolveBorderLeftTop); + } + // Copy border pixels to the top and bottom borders. 
+ for (int y = 0; y < kConvolveBorderLeftTop; ++y) { + memcpy(&source_[y * source_stride_], + &source_[kConvolveBorderLeftTop * source_stride_], + source_stride_ * sizeof(Pixel)); + memcpy(&source_[(y + kConvolveBorderLeftTop + height) * source_stride_], + &source_[(kConvolveBorderLeftTop + height - 1) * source_stride_], + source_stride_ * sizeof(Pixel)); + } + } +} + +template <int bitdepth, typename Pixel> +void ConvolveTest<bitdepth, Pixel>::Check(bool use_fixed_values, + const Pixel* src, const Pixel* dest, + libvpx_test::MD5* md5_digest) { + if (use_fixed_values) { + // For fixed values, input and output are identical. + const bool success = + test_utils::CompareBlocks(src, dest, param_.width, param_.height, + kMaxBlockWidth, kMaxBlockWidth, false, false); + EXPECT_TRUE(success); + } else { + // For random input, compare md5. + const int offset = + kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop; + const size_t size = sizeof(dest_clipped_) - offset * sizeof(Pixel); + md5_digest->Add(reinterpret_cast<const uint8_t*>(dest), size); + } +} + +template <int bitdepth, typename Pixel> +void ConvolveTest<bitdepth, Pixel>::Check16Bit(bool use_fixed_values, + const uint16_t* src, + const uint16_t* dest, + libvpx_test::MD5* md5_digest) { + if (use_fixed_values) { + // For fixed values, input and output are identical. + const bool success = + test_utils::CompareBlocks(src, dest, param_.width, param_.height, + kMaxBlockWidth, kMaxBlockWidth, false); + EXPECT_TRUE(success); + } else { + // For random input, compare md5. 
+ const int offset = + kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop; + const size_t size = sizeof(dest_16bit_) - offset * sizeof(uint16_t); + md5_digest->Add(reinterpret_cast<const uint8_t*>(dest), size); + } +} + +template <int bitdepth, typename Pixel> +void ConvolveTest<bitdepth, Pixel>::Test(bool use_fixed_values, int value, + int num_runs /*= 16 * 6*/) { + // There's no meaning testing fixed input in compound convolve. + if (type_param_.is_compound && use_fixed_values) GTEST_SKIP(); + + // Scaled convolve does not behave differently under most params. Only need to + // test the enabled compound implementation. + if (is_scaled_convolve_ && + (type_param_.is_intra_block_copy || type_param_.has_vertical_filter || + type_param_.has_horizontal_filter)) { + GTEST_SKIP(); + } + + // There should not be any function set for this combination. + if (type_param_.is_intra_block_copy && type_param_.is_compound) { + ASSERT_EQ(cur_convolve_func_, nullptr); + return; + } + + // Compound and intra block copy functions are only used for blocks 4x4 or + // greater. + if (type_param_.is_compound || type_param_.is_intra_block_copy) { + if (param_.width < 4 || param_.height < 4) { + GTEST_SKIP(); + } + } + + // Skip unspecialized functions. + if (cur_convolve_func_ == nullptr && cur_convolve_scale_func_ == nullptr) { + GTEST_SKIP(); + } + + SetInputData(use_fixed_values, value); + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed() + + GetDigestId()); + // [1,2048] for |step_[xy]|. This covers a scaling range of 1/1024 to 2x. 
+ const int step_x = (rnd.Rand16() & ((1 << 11) - 1)) + 1; + const int step_y = (rnd.Rand16() & ((1 << 11) - 1)) + 1; + int subpixel_x = 0; + int subpixel_y = 0; + int vertical_index = 0; + int horizontal_index = 0; + const int offset = + kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop; + const int offset_scale = + kConvolveBorderLeftTop * source_stride_ + kConvolveBorderLeftTop; + const Pixel* const src = source_ + offset; + const Pixel* const src_scale = source_ + offset_scale; + const ptrdiff_t src_stride = source_stride_ * sizeof(Pixel); + const ptrdiff_t src_stride_16 = source_stride_; + const ptrdiff_t dst_stride = kMaxBlockWidth * sizeof(Pixel); + // Pack Compound output since we control the predictor buffer. + const ptrdiff_t dst_stride_compound = param_.width; + + // Output is always 16 bits regardless of |bitdepth|. + uint16_t* dst_16 = dest_16bit_ + offset; + // Output depends on |bitdepth|. + Pixel* dst_pixel = dest_clipped_ + offset; + + // Collect the first |kMinimumViableRuns| into one md5 buffer. + libvpx_test::MD5 md5_digest; + + absl::Duration elapsed_time; + for (int i = 0; i < num_runs; ++i) { + // Test every filter. + // Because of masking |subpixel_{x,y}| values roll over every 16 iterations. + subpixel_x += 1 << 6; + subpixel_y += 1 << 6; + + const int horizontal_filter_id = (subpixel_x >> 6) & 0xF; + const int vertical_filter_id = (subpixel_y >> 6) & 0xF; + + // |filter_id| == 0 (copy) must be handled by the appropriate 1D or copy + // function. + if (horizontal_filter_id == 0 || vertical_filter_id == 0) { + continue; + } + + // For focused speed testing these can be set to the desired filter. Want + // only 8 tap filters? Set |{vertical,horizontal}_index| to 2. + vertical_index += static_cast<int>(i % 16 == 0); + vertical_index %= 4; + horizontal_index += static_cast<int>(i % 16 == 0); + horizontal_index %= 4; + + if (is_scaled_convolve_) { + ASSERT_EQ(cur_convolve_func_, nullptr); + // Output type is uint16_t. 
+ const absl::Time start = absl::Now(); + if (type_param_.is_compound) { + cur_convolve_scale_func_( + source_, src_stride, horizontal_index, vertical_index, 0, 0, step_x, + step_y, param_.width, param_.height, dst_16, dst_stride_compound); + } else { + cur_convolve_scale_func_( + source_, src_stride, horizontal_index, vertical_index, 0, 0, step_x, + step_y, param_.width, param_.height, dst_pixel, dst_stride); + } + elapsed_time += absl::Now() - start; + } else if (type_param_.is_compound) { + ASSERT_EQ(cur_convolve_scale_func_, nullptr); + // Output type is uint16_t. + const absl::Time start = absl::Now(); + cur_convolve_func_(src, src_stride, horizontal_index, vertical_index, + horizontal_filter_id, vertical_filter_id, param_.width, + param_.height, dst_16, dst_stride_compound); + elapsed_time += absl::Now() - start; + } else { + ASSERT_EQ(cur_convolve_scale_func_, nullptr); + // Output type is Pixel. + const absl::Time start = absl::Now(); + cur_convolve_func_(src, src_stride, horizontal_index, vertical_index, + horizontal_filter_id, vertical_filter_id, param_.width, + param_.height, dst_pixel, dst_stride); + elapsed_time += absl::Now() - start; + } + + // Only check the output for the first set. After that it's just repeated + // runs for speed timing. + if (i >= kMinimumViableRuns) continue; + + if (is_scaled_convolve_) { + // Convolve function does not clip the output. The clipping is applied + // later. But libaom clips the output. So we apply clipping to match + // libaom in tests. 
+ if (type_param_.is_compound) { + const int single_round_offset = (1 << bitdepth) + (1 << (bitdepth - 1)); + Pixel* dest_row = dest_clipped_; + for (int y = 0; y < kMaxBlockHeight; ++y) { + for (int x = 0; x < kMaxBlockWidth; ++x) { + dest_row[x] = static_cast<Pixel>(Clip3( + dest_16bit_[y * dst_stride_compound + x] - single_round_offset, + 0, (1 << bitdepth) - 1)); + } + dest_row += kMaxBlockWidth; + } + } + + if (type_param_.is_compound) { + Check16Bit(use_fixed_values, source_16bit_ + offset_scale, dst_16, + &md5_digest); + } else { + Check(use_fixed_values, src_scale, dst_pixel, &md5_digest); + } + } else if (type_param_.is_compound) { + // Need to copy source to a uint16_t buffer for comparison. + Pixel* src_ptr = source_; + uint16_t* src_ptr_16 = source_16bit_; + for (int y = 0; y < kMaxBlockHeight; ++y) { + for (int x = 0; x < kMaxBlockWidth; ++x) { + src_ptr_16[x] = src_ptr[x]; + } + src_ptr += src_stride_16; + src_ptr_16 += src_stride_16; + } + + Check16Bit(use_fixed_values, source_16bit_ + offset, dst_16, &md5_digest); + } else { + Check(use_fixed_values, src, dst_pixel, &md5_digest); + } + } + + if (!use_fixed_values) { + // md5 sums are only calculated for random input. + const char* ref_digest; + if (bitdepth == 8) { + ref_digest = GetDigest8bpp(GetDigestId()); + } else { +#if LIBGAV1_MAX_BITDEPTH >= 10 + ref_digest = GetDigest10bpp(GetDigestId()); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + } + const char* direction; + if (is_scaled_convolve_ || (type_param_.has_vertical_filter && + type_param_.has_horizontal_filter)) { + direction = "2D"; + } else if (type_param_.has_vertical_filter) { + direction = "Vertical"; + } else if (type_param_.has_horizontal_filter) { + direction = "Horizontal"; + } else { + direction = "Copy"; + } + const auto elapsed_time_us = + static_cast<int>(absl::ToInt64Microseconds(elapsed_time)); + printf("Mode Convolve%s%s%s%s[%25s]: %5d us MD5: %s\n", + type_param_.is_compound ? "Compound" : "", + type_param_.is_intra_block_copy ? 
"IntraBlockCopy" : "", + is_scaled_convolve_ ? "Scale" : "", direction, + absl::StrFormat("%dx%d", param_.width, param_.height).c_str(), + elapsed_time_us, md5_digest.Get()); + EXPECT_STREQ(ref_digest, md5_digest.Get()); + } +} + +void ApplyFilterToSignedInput(const int min_input, const int max_input, + const int8_t filter[kSubPixelTaps], + int* min_output, int* max_output) { + int min = 0, max = 0; + for (int i = 0; i < kSubPixelTaps; ++i) { + const int tap = filter[i]; + if (tap > 0) { + max += max_input * tap; + min += min_input * tap; + } else { + min += max_input * tap; + max += min_input * tap; + } + } + *min_output = min; + *max_output = max; +} + +void ApplyFilterToUnsignedInput(const int max_input, + const int8_t filter[kSubPixelTaps], + int* min_output, int* max_output) { + ApplyFilterToSignedInput(0, max_input, filter, min_output, max_output); +} + +// Validate the maximum ranges for different parts of the Convolve process. +template <int bitdepth> +void ShowRange() { + // Subtract one from the shift bits because the filter is pre-shifted by 1. + constexpr int horizontal_bits = (bitdepth == kBitdepth12) + ? kInterRoundBitsHorizontal12bpp - 1 + : kInterRoundBitsHorizontal - 1; + constexpr int vertical_bits = (bitdepth == kBitdepth12) + ? kInterRoundBitsVertical12bpp - 1 + : kInterRoundBitsVertical - 1; + constexpr int compound_vertical_bits = kInterRoundBitsCompoundVertical - 1; + + constexpr int compound_offset = (bitdepth == 8) ? 0 : kCompoundOffset; + + constexpr int max_input = (1 << bitdepth) - 1; + + const int8_t* worst_convolve_filter = kHalfSubPixelFilters[2][8]; + + // First pass. + printf("Bitdepth: %2d Input range: [%8d, %8d]\n", bitdepth, 0, + max_input); + + int min, max; + ApplyFilterToUnsignedInput(max_input, worst_convolve_filter, &min, &max); + + if (bitdepth == 8) { + // 8bpp can use int16_t for sums. + assert(min > INT16_MIN); + assert(max < INT16_MAX); + } else { + // 10bpp and 12bpp require int32_t. 
+ assert(min > INT32_MIN); + assert(max > INT16_MAX && max < INT32_MAX); + } + + printf(" intermediate range: [%8d, %8d]\n", min, max); + + const int first_pass_min = RightShiftWithRounding(min, horizontal_bits); + const int first_pass_max = RightShiftWithRounding(max, horizontal_bits); + + // All bitdepths can use int16_t for first pass output. + assert(first_pass_min > INT16_MIN); + assert(first_pass_max < INT16_MAX); + + printf(" first pass output range: [%8d, %8d]\n", first_pass_min, + first_pass_max); + + // Second pass. + ApplyFilterToSignedInput(first_pass_min, first_pass_max, + worst_convolve_filter, &min, &max); + + // All bitdepths require int32_t for second pass sums. + assert(min < INT16_MIN && min > INT32_MIN); + assert(max > INT16_MAX && max < INT32_MAX); + + printf(" intermediate range: [%8d, %8d]\n", min, max); + + // Second pass non-compound output is clipped to Pixel values. + const int second_pass_min = + Clip3(RightShiftWithRounding(min, vertical_bits), 0, max_input); + const int second_pass_max = + Clip3(RightShiftWithRounding(max, vertical_bits), 0, max_input); + printf(" second pass output range: [%8d, %8d]\n", second_pass_min, + second_pass_max); + + // Output is Pixel so matches Pixel values. + assert(second_pass_min == 0); + assert(second_pass_max == max_input); + + const int compound_second_pass_min = + RightShiftWithRounding(min, compound_vertical_bits) + compound_offset; + const int compound_second_pass_max = + RightShiftWithRounding(max, compound_vertical_bits) + compound_offset; + + printf(" compound second pass output range: [%8d, %8d]\n", + compound_second_pass_min, compound_second_pass_max); + + if (bitdepth == 8) { + // 8bpp output is int16_t without an offset. + assert(compound_second_pass_min > INT16_MIN); + assert(compound_second_pass_max < INT16_MAX); + } else { + // 10bpp and 12bpp use the offset to fit inside uint16_t. 
+ assert(compound_second_pass_min > 0); + assert(compound_second_pass_max < UINT16_MAX); + } + + printf("\n"); +} + +TEST(ConvolveTest, ShowRange) { + ShowRange<kBitdepth8>(); + ShowRange<kBitdepth10>(); + ShowRange<kBitdepth12>(); +} + +using ConvolveTest8bpp = ConvolveTest<8, uint8_t>; + +TEST_P(ConvolveTest8bpp, FixedValues) { + Test(true, 0); + Test(true, 1); + Test(true, 128); + Test(true, 255); +} + +TEST_P(ConvolveTest8bpp, RandomValues) { Test(false, 0); } + +TEST_P(ConvolveTest8bpp, DISABLED_Speed) { + const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height)); + Test(false, 0, num_runs); +} + +const ConvolveTestParam kConvolveParam[] = { + ConvolveTestParam(2, 2), ConvolveTestParam(2, 4), + ConvolveTestParam(4, 2), ConvolveTestParam(4, 4), + ConvolveTestParam(4, 8), ConvolveTestParam(8, 2), + ConvolveTestParam(8, 4), ConvolveTestParam(8, 8), + ConvolveTestParam(8, 16), ConvolveTestParam(16, 8), + ConvolveTestParam(16, 16), ConvolveTestParam(16, 32), + ConvolveTestParam(32, 16), ConvolveTestParam(32, 32), + ConvolveTestParam(32, 64), ConvolveTestParam(64, 32), + ConvolveTestParam(64, 64), ConvolveTestParam(64, 128), + ConvolveTestParam(128, 64), ConvolveTestParam(128, 128), +}; + +const ConvolveTypeParam kConvolveTypeParam[] = { + ConvolveTypeParam(false, false, false, false), + ConvolveTypeParam(false, false, false, true), + ConvolveTypeParam(false, false, true, false), + ConvolveTypeParam(false, false, true, true), + ConvolveTypeParam(false, true, false, false), + ConvolveTypeParam(false, true, false, true), + ConvolveTypeParam(false, true, true, false), + ConvolveTypeParam(false, true, true, true), + ConvolveTypeParam(true, false, false, false), + ConvolveTypeParam(true, false, false, true), + ConvolveTypeParam(true, false, true, false), + ConvolveTypeParam(true, false, true, true), + ConvolveTypeParam(true, true, false, false), + ConvolveTypeParam(true, true, false, true), + ConvolveTypeParam(true, true, true, false), + 
ConvolveTypeParam(true, true, true, true), +}; + +INSTANTIATE_TEST_SUITE_P(C, ConvolveTest8bpp, + testing::Combine(testing::ValuesIn(kConvolveParam), + testing::ValuesIn(kConvolveTypeParam), + testing::Bool())); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest8bpp, + testing::Combine(testing::ValuesIn(kConvolveParam), + testing::ValuesIn(kConvolveTypeParam), + testing::Bool())); +#endif // LIBGAV1_ENABLE_NEON + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, ConvolveTest8bpp, + testing::Combine(testing::ValuesIn(kConvolveParam), + testing::ValuesIn(kConvolveTypeParam), + testing::Bool())); +#endif // LIBGAV1_ENABLE_SSE4_1 + +#if LIBGAV1_ENABLE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest8bpp, + testing::Combine(testing::ValuesIn(kConvolveParam), + testing::ValuesIn(kConvolveTypeParam), + testing::Bool())); +#endif // LIBGAV1_ENABLE_AVX2 + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using ConvolveTest10bpp = ConvolveTest<10, uint16_t>; + +TEST_P(ConvolveTest10bpp, FixedValues) { + Test(true, 0); + Test(true, 1); + Test(true, 128); + Test(true, (1 << 10) - 1); +} + +TEST_P(ConvolveTest10bpp, RandomValues) { Test(false, 0); } + +TEST_P(ConvolveTest10bpp, DISABLED_Speed) { + const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height)); + Test(false, 0, num_runs); +} + +INSTANTIATE_TEST_SUITE_P(C, ConvolveTest10bpp, + testing::Combine(testing::ValuesIn(kConvolveParam), + testing::ValuesIn(kConvolveTypeParam), + testing::Bool())); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/distance_weighted_blend_test.cc b/src/dsp/distance_weighted_blend_test.cc new file mode 100644 index 0000000..b3f3a2e --- /dev/null +++ b/src/dsp/distance_weighted_blend_test.cc @@ -0,0 +1,324 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/distance_weighted_blend.h" + +#include <cstdint> +#include <ostream> +#include <string> +#include <type_traits> + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kNumSpeedTests = 500000; + +constexpr int kQuantizedDistanceLookup[4][2] = { + {9, 7}, {11, 5}, {12, 4}, {13, 3}}; + +struct TestParam { + TestParam(int width, int height) : width(width), height(height) {} + int width; + int height; +}; + +std::ostream& operator<<(std::ostream& os, const TestParam& param) { + return os << "BlockSize" << param.width << "x" << param.height; +} + +template <int bitdepth, typename Pixel> +class DistanceWeightedBlendTest : public testing::TestWithParam<TestParam>, + public test_utils::MaxAlignedAllocable { + public: + DistanceWeightedBlendTest() = default; + ~DistanceWeightedBlendTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + DistanceWeightedBlendInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_func_ = dsp->distance_weighted_blend; + const testing::TestInfo* const test_info 
= + testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + base_func_ = nullptr; + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + DistanceWeightedBlendInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + DistanceWeightedBlendInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + func_ = dsp->distance_weighted_blend; + } + + protected: + void Test(const char* digest, int num_tests); + + private: + using PredType = + typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type; + static constexpr int kDestStride = kMaxSuperBlockSizeInPixels; + const int width_ = GetParam().width; + const int height_ = GetParam().height; + alignas(kMaxAlignment) PredType + source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels]; + alignas(kMaxAlignment) PredType + source2_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels]; + Pixel dest_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {}; + Pixel reference_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = + {}; + dsp::DistanceWeightedBlendFunc base_func_; + dsp::DistanceWeightedBlendFunc func_; +}; + +template <int bitdepth, typename Pixel> +void DistanceWeightedBlendTest<bitdepth, Pixel>::Test(const char* digest, + int num_tests) { + if (func_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + PredType* src_1 = source1_; + PredType* src_2 = source2_; + + const int index = rnd.Rand8() & 3; + const uint8_t weight_0 = kQuantizedDistanceLookup[index][0]; + const uint8_t weight_1 = kQuantizedDistanceLookup[index][1]; + // In libgav1, predictors have an offset which are later subtracted and + // clipped in distance weighted blending. Therefore we add the offset + // here to match libaom's implementation. 
+ for (int y = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x) { + // distance_weighted_blend is applied to compound prediction values. This + // implies a range far exceeding that of pixel values. + // The ranges include kCompoundOffset in 10bpp and 12bpp. + // see: src/dsp/convolve.cc & src/dsp/warp.cc. + static constexpr int kCompoundPredictionRange[3][2] = { + // 8bpp + {-5132, 9212}, + // 10bpp + {3988, 61532}, + // 12bpp + {3974, 61559}, + }; + constexpr int bitdepth_index = (bitdepth - 8) >> 1; + const int min_val = kCompoundPredictionRange[bitdepth_index][0]; + const int max_val = kCompoundPredictionRange[bitdepth_index][1]; + src_1[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val); + src_2[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val); + } + src_1 += width_; + src_2 += width_; + } + absl::Duration elapsed_time; + for (int i = 0; i < num_tests; ++i) { + const absl::Time start = absl::Now(); + func_(source1_, source2_, weight_0, weight_1, width_, height_, dest_, + sizeof(Pixel) * kDestStride); + elapsed_time += absl::Now() - start; + } + + test_utils::CheckMd5Digest( + "DistanceWeightedBlend", + absl::StrFormat("BlockSize%dx%d", width_, height_).c_str(), digest, dest_, + sizeof(dest_), elapsed_time); +} + +const TestParam kTestParam[] = { + TestParam(4, 4), TestParam(4, 8), TestParam(4, 16), + TestParam(8, 4), TestParam(8, 8), TestParam(8, 16), + TestParam(8, 32), TestParam(16, 4), TestParam(16, 8), + TestParam(16, 16), TestParam(16, 32), TestParam(16, 64), + TestParam(32, 8), TestParam(32, 16), TestParam(32, 32), + TestParam(32, 64), TestParam(32, 128), TestParam(64, 16), + TestParam(64, 32), TestParam(64, 64), TestParam(64, 128), + TestParam(128, 32), TestParam(128, 64), TestParam(128, 128), +}; + +const char* GetDistanceWeightedBlendDigest8bpp(const TestParam block_size) { + static const char* const kDigestsWidth4[] = { + "ebf389f724f8ab46a2cac895e4e073ca", + "09acd567b6b12c8cf8eb51d8b86eb4bf", + 
"57bb4d65695d8ec6752f2bd8686b64fd", + }; + static const char* const kDigestsWidth8[] = { + "270905ac76f9a2cba8a552eb0bf7c8c1", + "f0801c8574d2c271ef2bbea77a1d7352", + "e761b580e3312be33a227492a233ce72", + "ff214dab1a7e98e2285961d6421720c6", + }; + static const char* const kDigestsWidth16[] = { + "4f712609a36e817f9752326d58562ff8", "14243f5c5f7c7104160c1f2cef0a0fbc", + "3ac3f3161b7c8dd8436b02abfdde104a", "81a00b704e0e41a5dbe6436ac70c098d", + "af8fd02017c7acdff788be742d700baa", + }; + static const char* const kDigestsWidth32[] = { + "ee34332c66a6d6ed8ce64031aafe776c", "b5e3d22bd2dbdb624c8b86a1afb5ce6d", + "607ffc22098d81b7e37a7bf62f4af5d3", "3823dbf043b4682f56d5ca698e755ea5", + "57f7e8d1e67645269ce760a2c8da4afc", + }; + static const char* const kDigestsWidth64[] = { + "4acf556b921956c2bc24659cd5128401", + "a298c544c9c3b27924b4c23cc687ea5a", + "539e2df267782ce61c70103b23b7d922", + "3b0cb2a0b5d384efee4d81401025bec1", + }; + static const char* const kDigestsWidth128[] = { + "d71ee689a40ff5f390d07717df4b7233", + "8b56b636dd712c2f8d138badb7219991", + "8cfc8836908902b8f915639b7bff45b3", + }; + const int height_index = + FloorLog2(block_size.height) - FloorLog2(block_size.width) + 2; + switch (block_size.width) { + case 4: + return kDigestsWidth4[height_index - 2]; + case 8: + return kDigestsWidth8[height_index - 1]; + case 16: + return kDigestsWidth16[height_index]; + case 32: + return kDigestsWidth32[height_index]; + case 64: + return kDigestsWidth64[height_index]; + default: + EXPECT_EQ(block_size.width, 128) + << "Unknown width parameter: " << block_size.width; + return kDigestsWidth128[height_index]; + } +} + +using DistanceWeightedBlendTest8bpp = DistanceWeightedBlendTest<8, uint8_t>; + +TEST_P(DistanceWeightedBlendTest8bpp, Blending) { + Test(GetDistanceWeightedBlendDigest8bpp(GetParam()), 1); +} + +TEST_P(DistanceWeightedBlendTest8bpp, DISABLED_Speed) { + Test(GetDistanceWeightedBlendDigest8bpp(GetParam()), kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, 
DistanceWeightedBlendTest8bpp, + testing::ValuesIn(kTestParam)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, DistanceWeightedBlendTest8bpp, + testing::ValuesIn(kTestParam)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, DistanceWeightedBlendTest8bpp, + testing::ValuesIn(kTestParam)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetDistanceWeightedBlendDigest10bpp(const TestParam block_size) { + static const char* const kDigestsWidth4[] = { + "55f594b56e16d5c401274affebbcc3d3", + "69df14da4bb33a8f7d7087921008e919", + "1b61f33604c54015794198a13bfebf46", + }; + static const char* const kDigestsWidth8[] = { + "825a938185b152f7cf09bf1c0723ce2b", + "85ea315c51d979bc9b45834d6b40ec6f", + "92ebde208e8c39f7ec6de2de82182dbb", + "520f84716db5b43684dbb703806383fe", + }; + static const char* const kDigestsWidth16[] = { + "12ca23e3e2930005a0511646e8c83da4", "6208694a6744f4a3906f58c1add670e3", + "a33d63889df989a3bbf84ff236614267", "34830846ecb0572a98bbd192fed02b16", + "34bb2f79c0bd7f9a80691b8af597f2a8", + }; + static const char* const kDigestsWidth32[] = { + "fa97f2d0e3143f1f44d3ac018b0d696d", "3df4a22456c9ab6ed346ab1b9750ae7d", + "6276a058b35c6131bc0c94a4b4a37ebc", "9ca42da5d2d5eb339df03ae2c7a26914", + "2ff0dc010a7b40830fb47423a9beb894", + }; + static const char* const kDigestsWidth64[] = { + "800e692c520f99223bc24c1ac95a0166", + "818b6d20426585ef7fe844015a03aaf5", + "fb48691ccfff083e01d74826e88e613f", + "0bd350bc5bc604a224d77a5f5a422698", + }; + static const char* const kDigestsWidth128[] = { + "02aac5d5669c1245da876c5440c4d829", + "a130840813cd6bd69d09bcf5f8d0180f", + "6ece1846bea55e8f8f2ed7fbf73718de", + }; + const int height_index = + FloorLog2(block_size.height) - FloorLog2(block_size.width) + 2; + switch (block_size.width) { + case 4: + return kDigestsWidth4[height_index - 2]; + case 8: + return kDigestsWidth8[height_index - 1]; + case 16: + return kDigestsWidth16[height_index]; + case 32: + return 
kDigestsWidth32[height_index]; + case 64: + return kDigestsWidth64[height_index]; + default: + EXPECT_EQ(block_size.width, 128) + << "Unknown width parameter: " << block_size.width; + return kDigestsWidth128[height_index]; + } +} + +using DistanceWeightedBlendTest10bpp = DistanceWeightedBlendTest<10, uint16_t>; + +TEST_P(DistanceWeightedBlendTest10bpp, Blending) { + Test(GetDistanceWeightedBlendDigest10bpp(GetParam()), 1); +} + +TEST_P(DistanceWeightedBlendTest10bpp, DISABLED_Speed) { + Test(GetDistanceWeightedBlendDigest10bpp(GetParam()), kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, DistanceWeightedBlendTest10bpp, + testing::ValuesIn(kTestParam)); + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, DistanceWeightedBlendTest10bpp, + testing::ValuesIn(kTestParam)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, DistanceWeightedBlendTest10bpp, + testing::ValuesIn(kTestParam)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/dsp.cc b/src/dsp/dsp.cc index 5b54c4e..a3d7701 100644 --- a/src/dsp/dsp.cc +++ b/src/dsp/dsp.cc @@ -16,7 +16,6 @@ #include <mutex> // NOLINT (unapproved c++11 header) -#include "src/dsp/arm/weight_mask_neon.h" #include "src/dsp/average_blend.h" #include "src/dsp/cdef.h" #include "src/dsp/convolve.h" @@ -24,6 +23,10 @@ #include "src/dsp/film_grain.h" #include "src/dsp/intra_edge.h" #include "src/dsp/intrapred.h" +#include "src/dsp/intrapred_cfl.h" +#include "src/dsp/intrapred_directional.h" +#include "src/dsp/intrapred_filter.h" +#include "src/dsp/intrapred_smooth.h" #include "src/dsp/inverse_transform.h" #include "src/dsp/loop_filter.h" #include "src/dsp/loop_restoration.h" @@ -39,6 +42,30 @@ namespace libgav1 { namespace dsp_internal { +void DspInit_C() { + dsp::AverageBlendInit_C(); + dsp::CdefInit_C(); + dsp::ConvolveInit_C(); + dsp::DistanceWeightedBlendInit_C(); + dsp::FilmGrainInit_C(); + dsp::IntraEdgeInit_C(); + 
dsp::IntraPredCflInit_C(); + dsp::IntraPredDirectionalInit_C(); + dsp::IntraPredFilterInit_C(); + dsp::IntraPredInit_C(); + dsp::IntraPredSmoothInit_C(); + dsp::InverseTransformInit_C(); + dsp::LoopFilterInit_C(); + dsp::LoopRestorationInit_C(); + dsp::MaskBlendInit_C(); + dsp::MotionFieldProjectionInit_C(); + dsp::MotionVectorSearchInit_C(); + dsp::ObmcInit_C(); + dsp::SuperResInit_C(); + dsp::WarpInit_C(); + dsp::WeightMaskInit_C(); +} + dsp::Dsp* GetWritableDspTable(int bitdepth) { switch (bitdepth) { case 8: { @@ -62,23 +89,7 @@ namespace dsp { void DspInit() { static std::once_flag once; std::call_once(once, []() { - AverageBlendInit_C(); - CdefInit_C(); - ConvolveInit_C(); - DistanceWeightedBlendInit_C(); - FilmGrainInit_C(); - IntraEdgeInit_C(); - IntraPredInit_C(); - InverseTransformInit_C(); - LoopFilterInit_C(); - LoopRestorationInit_C(); - MaskBlendInit_C(); - MotionFieldProjectionInit_C(); - MotionVectorSearchInit_C(); - ObmcInit_C(); - SuperResInit_C(); - WarpInit_C(); - WeightMaskInit_C(); + dsp_internal::DspInit_C(); #if LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2 const uint32_t cpu_features = GetCpuInfo(); #if LIBGAV1_ENABLE_SSE4_1 @@ -87,7 +98,11 @@ void DspInit() { CdefInit_SSE4_1(); ConvolveInit_SSE4_1(); DistanceWeightedBlendInit_SSE4_1(); + FilmGrainInit_SSE4_1(); IntraEdgeInit_SSE4_1(); + IntraPredCflInit_SSE4_1(); + IntraPredDirectionalInit_SSE4_1(); + IntraPredFilterInit_SSE4_1(); IntraPredInit_SSE4_1(); IntraPredCflInit_SSE4_1(); IntraPredSmoothInit_SSE4_1(); @@ -108,6 +123,7 @@ void DspInit() { #endif // LIBGAV1_ENABLE_SSE4_1 #if LIBGAV1_ENABLE_AVX2 if ((cpu_features & kAVX2) != 0) { + CdefInit_AVX2(); ConvolveInit_AVX2(); LoopRestorationInit_AVX2(); #if LIBGAV1_MAX_BITDEPTH >= 10 @@ -125,7 +141,7 @@ void DspInit() { IntraEdgeInit_NEON(); IntraPredCflInit_NEON(); IntraPredDirectionalInit_NEON(); - IntraPredFilterIntraInit_NEON(); + IntraPredFilterInit_NEON(); IntraPredInit_NEON(); IntraPredSmoothInit_NEON(); InverseTransformInit_NEON(); 
@@ -138,6 +154,9 @@ void DspInit() { SuperResInit_NEON(); WarpInit_NEON(); WeightMaskInit_NEON(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + InverseTransformInit10bpp_NEON(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 #endif // LIBGAV1_ENABLE_NEON }); } diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index fcbac3a..153db7f 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -17,7 +17,7 @@ #ifndef LIBGAV1_SRC_DSP_DSP_H_ #define LIBGAV1_SRC_DSP_DSP_H_ -#include <cstddef> // ptrdiff_t +#include <cstddef> #include <cstdint> #include <cstdlib> @@ -372,8 +372,9 @@ using SuperResCoefficientsFunc = void (*)(int upscaled_width, // |coefficients| is the upscale filter used by each pixel in a row. It is not // used by the C function. // |source| is the input frame buffer. It will be line extended. +// |source_stride| is given in pixels. // |dest| is the output buffer. -// |stride| is given in pixels, and shared by |source| and |dest|. +// |dest_stride| is given in pixels. // |height| is the height of the block to be processed. // |downscaled_width| is the width of the input frame. // |upscaled_width| is the width of the output frame. @@ -381,9 +382,10 @@ using SuperResCoefficientsFunc = void (*)(int upscaled_width, // pixel. // |initial_subpixel_x| is a base offset from which |step| increments. using SuperResFunc = void (*)(const void* coefficients, void* source, - ptrdiff_t stride, int height, + ptrdiff_t source_stride, int height, int downscaled_width, int upscaled_width, - int initial_subpixel_x, int step, void* dest); + int initial_subpixel_x, int step, void* dest, + ptrdiff_t dest_stride); // Loop restoration function signature. Sections 7.16, 7.17. // |restoration_info| contains loop restoration information, such as filter @@ -391,14 +393,15 @@ using SuperResFunc = void (*)(const void* coefficients, void* source, // |source| is the input frame buffer, which is deblocked and cdef filtered. // |top_border| and |bottom_border| are the top and bottom borders. // |dest| is the output. 
-// |stride| is given in pixels, and shared by |source|, |top_border|, -// |bottom_border| and |dest|. +// |stride| is given in pixels, and shared by |source| and |dest|. +// |top_border_stride| and |bottom_border_stride| are given in pixels. // |restoration_buffer| contains buffers required for self guided filter and // wiener filter. They must be initialized before calling. using LoopRestorationFunc = void (*)( const RestorationUnitInfo& restoration_info, const void* source, - const void* top_border, const void* bottom_border, ptrdiff_t stride, - int width, int height, RestorationBuffer* restoration_buffer, void* dest); + ptrdiff_t stride, const void* top_border, ptrdiff_t top_border_stride, + const void* bottom_border, ptrdiff_t bottom_border_stride, int width, + int height, RestorationBuffer* restoration_buffer, void* dest); // Index 0 is Wiener Filter. // Index 1 is Self Guided Restoration Filter. @@ -900,6 +903,11 @@ namespace dsp_internal { (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \ LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_SSE4_1) +// Initializes C-only function pointers. Note some entries may be set to +// nullptr if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is not defined. This is meant +// for use in tests only, it is not thread-safe. +void DspInit_C(); + // Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't // exist. This version is meant for use by test or dsp/*Init() functions only. dsp::Dsp* GetWritableDspTable(int bitdepth); diff --git a/src/dsp/dsp_test.cc b/src/dsp/dsp_test.cc new file mode 100644 index 0000000..bf7b9f3 --- /dev/null +++ b/src/dsp/dsp_test.cc @@ -0,0 +1,248 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/dsp.h" + +#include <algorithm> +#include <cstddef> +#include <cstdint> + +#include "absl/strings/str_cat.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +#include "tests/utils.h" +#endif + +namespace libgav1 { +namespace dsp { +namespace { + +// Maps 1D transform to the maximum valid size for the corresponding transform. +constexpr int kMax1DTransformSize[kNum1DTransforms] = { + k1DTransformSize64, // Dct. + k1DTransformSize16, // Adst. + k1DTransformSize32, // Identity. + k1DTransformSize4, // Wht. 
+}; + +void CheckTables(bool c_only) { +#if LIBGAV1_MAX_BITDEPTH >= 10 + static constexpr int kBitdepths[] = {kBitdepth8, kBitdepth10}; +#else + static constexpr int kBitdepths[] = {kBitdepth8}; +#endif + + for (const auto& bitdepth : kBitdepths) { + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + SCOPED_TRACE(absl::StrCat("bitdepth: ", bitdepth)); + for (int i = 0; i < kNumTransformSizes; ++i) { + for (int j = 0; j < kNumIntraPredictors; ++j) { + EXPECT_NE(dsp->intra_predictors[i][j], nullptr) + << "index [" << i << "][" << j << "]"; + } + } + EXPECT_NE(dsp->directional_intra_predictor_zone1, nullptr); + EXPECT_NE(dsp->directional_intra_predictor_zone2, nullptr); + EXPECT_NE(dsp->directional_intra_predictor_zone3, nullptr); + EXPECT_NE(dsp->filter_intra_predictor, nullptr); + for (int i = 0; i < kNumTransformSizes; ++i) { + if (std::max(kTransformWidth[i], kTransformHeight[i]) == 64) { + EXPECT_EQ(dsp->cfl_intra_predictors[i], nullptr) + << "index [" << i << "]"; + for (int j = 0; j < kNumSubsamplingTypes; ++j) { + EXPECT_EQ(dsp->cfl_subsamplers[i][j], nullptr) + << "index [" << i << "][" << j << "]"; + } + } else { + EXPECT_NE(dsp->cfl_intra_predictors[i], nullptr) + << "index [" << i << "]"; + for (int j = 0; j < kNumSubsamplingTypes; ++j) { + EXPECT_NE(dsp->cfl_subsamplers[i][j], nullptr) + << "index [" << i << "][" << j << "]"; + } + } + } + EXPECT_NE(dsp->intra_edge_filter, nullptr); + EXPECT_NE(dsp->intra_edge_upsampler, nullptr); + for (int i = 0; i < kNum1DTransforms; ++i) { + for (int j = 0; j < kNum1DTransformSizes; ++j) { + for (int k = 0; k < 2; ++k) { + if (j <= kMax1DTransformSize[i]) { + EXPECT_NE(dsp->inverse_transforms[i][j][k], nullptr) + << "index [" << i << "][" << j << "][" << k << "]"; + } else { + EXPECT_EQ(dsp->inverse_transforms[i][j][k], nullptr) + << "index [" << i << "][" << j << "][" << k << "]"; + } + } + } + } + for (int i = 0; i < kNumLoopFilterSizes; ++i) { + for (int j = 0; j < kNumLoopFilterTypes; ++j) 
{ + EXPECT_NE(dsp->loop_filters[i][j], nullptr) + << "index [" << i << "][" << j << "]"; + } + } + for (int i = 0; i < 2; ++i) { + EXPECT_NE(dsp->loop_restorations[i], nullptr) << "index [" << i << "]"; + } + + bool super_res_coefficients_is_nonnull = LIBGAV1_ENABLE_NEON; +#if LIBGAV1_ENABLE_SSE4_1 + const uint32_t cpu_features = GetCpuInfo(); + super_res_coefficients_is_nonnull = (cpu_features & kSSE4_1) != 0; +#endif + if (c_only) super_res_coefficients_is_nonnull = false; + if (super_res_coefficients_is_nonnull) { + EXPECT_NE(dsp->super_res_coefficients, nullptr); + } else { + EXPECT_EQ(dsp->super_res_coefficients, nullptr); + } + + EXPECT_NE(dsp->super_res, nullptr); + EXPECT_NE(dsp->cdef_direction, nullptr); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + EXPECT_NE(dsp->cdef_filters[i][j], nullptr) + << "index [" << i << "][" << j << "]"; + } + } + for (auto convolve_func : dsp->convolve_scale) { + EXPECT_NE(convolve_func, nullptr); + } + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 2; ++l) { + for (int m = 0; m < 2; ++m) { + if (j == 1 && k == 1) { + EXPECT_EQ(dsp->convolve[j][k][l][m], nullptr); + } else { + EXPECT_NE(dsp->convolve[j][k][l][m], nullptr); + } + } + } + } + } + for (const auto& m : dsp->mask_blend) { + for (int i = 0; i < 2; ++i) { + if (i == 0 || bitdepth >= 10) { + EXPECT_NE(m[i], nullptr); + } else { + EXPECT_EQ(m[i], nullptr); + } + } + } + for (const auto& m : dsp->inter_intra_mask_blend_8bpp) { + if (bitdepth == 8) { + EXPECT_NE(m, nullptr); + } else { + EXPECT_EQ(m, nullptr); + } + } + for (int i = kBlock4x4; i < kMaxBlockSizes; ++i) { + const int width_index = k4x4WidthLog2[i] - 1; + const int height_index = k4x4HeightLog2[i] - 1; + // Only block sizes >= 8x8 are handled with this function. 
+ if (width_index < 0 || height_index < 0) continue; + + for (size_t j = 0; j < 2; ++j) { + EXPECT_NE(dsp->weight_mask[width_index][height_index][j], nullptr) + << ToString(static_cast<BlockSize>(i)) << " index [" << width_index + << "]" + << "[" << height_index << "][" << j << "]"; + } + } + + EXPECT_NE(dsp->average_blend, nullptr); + EXPECT_NE(dsp->distance_weighted_blend, nullptr); + for (int i = 0; i < kNumObmcDirections; ++i) { + EXPECT_NE(dsp->obmc_blend[i], nullptr) + << "index [" << ToString(static_cast<ObmcDirection>(i)) << "]"; + } + EXPECT_NE(dsp->warp, nullptr); + EXPECT_NE(dsp->warp_compound, nullptr); + + for (int i = 0; i < kNumAutoRegressionLags - 1; ++i) { + EXPECT_NE(dsp->film_grain.luma_auto_regression[i], nullptr) + << "index [" << i << "]"; + } + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < kNumAutoRegressionLags; ++j) { + if (i == 0 && j == 0) { + EXPECT_EQ(dsp->film_grain.chroma_auto_regression[i][j], nullptr) + << " index [" << i << "]" + << "[" << j << "]"; + } else { + EXPECT_NE(dsp->film_grain.chroma_auto_regression[i][j], nullptr) + << " index [" << i << "]" + << "[" << j << "]"; + } + } + EXPECT_NE(dsp->film_grain.construct_noise_stripes[i], nullptr) + << "index [" << i << "]"; + EXPECT_NE(dsp->film_grain.blend_noise_chroma[i], nullptr) + << "index [" << i << "]"; + } + EXPECT_NE(dsp->film_grain.construct_noise_image_overlap, nullptr); + EXPECT_NE(dsp->film_grain.initialize_scaling_lut, nullptr); + EXPECT_NE(dsp->film_grain.blend_noise_luma, nullptr); + + EXPECT_NE(dsp->motion_field_projection_kernel, nullptr); + EXPECT_NE(dsp->mv_projection_compound[0], nullptr); + EXPECT_NE(dsp->mv_projection_compound[1], nullptr); + EXPECT_NE(dsp->mv_projection_compound[2], nullptr); + EXPECT_NE(dsp->mv_projection_single[0], nullptr); + EXPECT_NE(dsp->mv_projection_single[1], nullptr); + EXPECT_NE(dsp->mv_projection_single[2], nullptr); + } +} + +TEST(Dsp, TablesArePopulated) { + DspInit(); + CheckTables(/*c_only=*/false); +} + +#if 
LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +TEST(Dsp, TablesArePopulatedCOnly) { + test_utils::ResetDspTable(kBitdepth8); +#if LIBGAV1_MAX_BITDEPTH >= 10 + test_utils::ResetDspTable(kBitdepth10); +#endif + dsp_internal::DspInit_C(); + CheckTables(/*c_only=*/true); +} +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + +TEST(Dsp, GetDspTable) { + EXPECT_EQ(GetDspTable(1), nullptr); + EXPECT_NE(GetDspTable(8), nullptr); + EXPECT_EQ(dsp_internal::GetWritableDspTable(1), nullptr); + EXPECT_NE(dsp_internal::GetWritableDspTable(8), nullptr); +#if LIBGAV1_MAX_BITDEPTH >= 10 + EXPECT_NE(GetDspTable(10), nullptr); + EXPECT_NE(dsp_internal::GetWritableDspTable(10), nullptr); +#else + EXPECT_EQ(GetDspTable(10), nullptr); + EXPECT_EQ(dsp_internal::GetWritableDspTable(10), nullptr); +#endif +} + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/film_grain.h b/src/dsp/film_grain.h index fe93270..f75a354 100644 --- a/src/dsp/film_grain.h +++ b/src/dsp/film_grain.h @@ -25,6 +25,14 @@ // ARM: #include "src/dsp/arm/film_grain_neon.h" +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/film_grain_sse4.h" +// clang-format on + // IWYU pragma: end_exports namespace libgav1 { diff --git a/src/dsp/intra_edge_test.cc b/src/dsp/intra_edge_test.cc new file mode 100644 index 0000000..90960c6 --- /dev/null +++ b/src/dsp/intra_edge_test.cc @@ -0,0 +1,504 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intra_edge.h" + +#include <cstdint> +#include <cstdio> +#include <ostream> + +#include "absl/strings/match.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +const char kIntraEdge[] = "IntraEdge"; +const char kIntraEdgeFilterName[] = "Intra Edge Filter"; +const char kIntraEdgeUpsamplerName[] = "Intra Edge Upsampler"; + +constexpr int kIntraEdgeFilterTestMaxSize = 129; +constexpr int kIntraEdgeFilterTestFixedInput[kIntraEdgeFilterTestMaxSize] = { + 159, 208, 54, 136, 205, 124, 125, 165, 164, 63, 171, 143, 210, 236, 253, + 233, 139, 113, 66, 211, 133, 61, 91, 123, 187, 76, 110, 172, 61, 103, + 239, 147, 247, 120, 18, 106, 180, 159, 208, 54, 136, 205, 124, 125, 165, + 164, 63, 171, 143, 210, 236, 253, 233, 139, 113, 66, 211, 133, 61, 91, + 123, 187, 76, 110, 172, 61, 103, 239, 147, 247, 120, 18, 106, 180, 159, + 208, 54, 136, 205, 124, 125, 165, 164, 63, 171, 143, 210, 236, 253, 233, + 139, 113, 66, 211, 133, 61, 91, 123, 187, 76, 110, 172, 61, 103, 239, + 147, 247, 120, 18, 106, 180, 159, 208, 54, 136, 205, 124, 125, 165, 164, + 63, 171, 143, 210, 236, 253, 233, 139, 113, +}; +constexpr int kIntraEdgeUpsamplerTestFixedInput[] = { + 208, 54, 136, 205, 124, 125, 165, 164, 63, + 171, 143, 210, 236, 208, 54, 136, 205}; + +struct 
EdgeFilterParams { + int size; + int strength; +}; + +std::ostream& operator<<(std::ostream& os, const EdgeFilterParams& param) { + return os << "size: " << param.size << ", strength: " << param.strength; +} + +// Each size is paired with strength 1, 2, and 3. +// In general, the size is expressible as 2^n+1, but all sizes up to 129 are +// permissible. +constexpr EdgeFilterParams kIntraEdgeFilterParamList[] = { + {1, 1}, {1, 2}, {1, 3}, {2, 1}, {2, 2}, {2, 3}, {5, 1}, {5, 2}, + {5, 3}, {9, 1}, {9, 2}, {9, 3}, {17, 1}, {17, 2}, {17, 3}, {33, 1}, + {33, 2}, {33, 3}, {50, 1}, {50, 2}, {50, 3}, {55, 1}, {55, 2}, {55, 3}, + {65, 1}, {65, 2}, {65, 3}, {129, 1}, {129, 2}, {129, 3}}; + +template <int bitdepth, typename Pixel> +class IntraEdgeFilterTest : public testing::TestWithParam<EdgeFilterParams> { + public: + IntraEdgeFilterTest() = default; + IntraEdgeFilterTest(const IntraEdgeFilterTest&) = delete; + IntraEdgeFilterTest& operator=(const IntraEdgeFilterTest&) = delete; + ~IntraEdgeFilterTest() override = default; + + protected: + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + IntraEdgeInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_intra_edge_filter_ = dsp->intra_edge_filter; + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + base_intra_edge_filter_ = nullptr; + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + IntraEdgeInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + IntraEdgeInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + + cur_intra_edge_filter_ = dsp->intra_edge_filter; + } + + void TestFixedValues(const char* digest); + void TestRandomValues(int num_runs); + + Pixel buffer_[kIntraEdgeFilterTestMaxSize]; 
+ Pixel base_buffer_[kIntraEdgeFilterTestMaxSize]; + int strength_ = GetParam().strength; + int size_ = GetParam().size; + + IntraEdgeFilterFunc base_intra_edge_filter_; + IntraEdgeFilterFunc cur_intra_edge_filter_; +}; + +template <int bitdepth, typename Pixel> +void IntraEdgeFilterTest<bitdepth, Pixel>::TestFixedValues( + const char* const digest) { + if (cur_intra_edge_filter_ == nullptr) return; + for (int i = 0; i < kIntraEdgeFilterTestMaxSize; ++i) { + buffer_[i] = kIntraEdgeFilterTestFixedInput[i]; + } + const absl::Time start = absl::Now(); + cur_intra_edge_filter_(buffer_, size_, strength_); + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest(kIntraEdge, kIntraEdgeFilterName, digest, buffer_, + kIntraEdgeFilterTestMaxSize * sizeof(buffer_[0]), + elapsed_time); +} + +template <int bitdepth, typename Pixel> +void IntraEdgeFilterTest<bitdepth, Pixel>::TestRandomValues(int num_runs) { + if (base_intra_edge_filter_ == nullptr) return; + if (cur_intra_edge_filter_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + absl::Duration elapsed_time; + absl::Duration base_elapsed_time; + for (int num_tests = 0; num_tests < num_runs; ++num_tests) { + for (int i = 0; i < kIntraEdgeFilterTestMaxSize; ++i) { + const Pixel val = rnd(bitdepth); + buffer_[i] = val; + base_buffer_[i] = val; + } + const absl::Time base_start = absl::Now(); + base_intra_edge_filter_(base_buffer_, size_, strength_); + base_elapsed_time += absl::Now() - base_start; + const absl::Time start = absl::Now(); + cur_intra_edge_filter_(buffer_, size_, strength_); + elapsed_time += absl::Now() - start; + } + if (num_runs > 1) { + printf("Mode %s[%31s] Size %3d Strength %d C: %5d us SIMD: %5d us %2.2fx\n", + kIntraEdge, kIntraEdgeFilterName, size_, strength_, + static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time)), + static_cast<int>(absl::ToInt64Microseconds(elapsed_time)), + 
absl::ToDoubleMicroseconds(base_elapsed_time) / + absl::ToDoubleMicroseconds(elapsed_time)); + } else { + printf("Mode %s[%31s] Size %3d Strength %d\n", kIntraEdge, + kIntraEdgeFilterName, size_, strength_); + } + for (int i = 0; i < kIntraEdgeFilterTestMaxSize; ++i) { + EXPECT_EQ(buffer_[i], base_buffer_[i]) << "Mismatch in index: " << i; + } +} + +using IntraEdgeFilterTest8bpp = IntraEdgeFilterTest<8, uint8_t>; + +const char* GetIntraEdgeFilterDigest8bpp(int strength, int size) { + static const char* const kDigestsSize1[3] = { + "f7f681cf7047602fafc7fb416ecf46e1", "f7f681cf7047602fafc7fb416ecf46e1", + "f7f681cf7047602fafc7fb416ecf46e1"}; + static const char* const kDigestsSize2[3] = { + "cb24cc54900fb75d767f3de797451e43", "380c80c89e1e8cda81ee0d3d4b29b8b7", + "a7eb3dba95ff35c2df45a274afbc9772"}; + static const char* const kDigestsSize5[3] = { + "23380cb37688d4c3a8f70a276be65eed", "ec1e23d5b996a527ed3d45c0d552bf22", + "d313523d3b7646fdbb873c61ffe7a51a"}; + static const char* const kDigestsSize9[3] = { + "e79597e9d62893754fc77d80ca86329a", "f7644e9748984914100e7031c6432272", + "bdf4f16734c86338716fb436c196ecc6"}; + static const char* const kDigestsSize17[3] = { + "13ad15c833e850348eecb9fea4f3cadb", "e5988a72391250c702a8192893df40dd", + "8f68603598638fa33203fe1233d273b1"}; + static const char* const kDigestsSize33[3] = { + "51156da8f4d527e0c011040769987dbd", "eff17eaf73a7bb7fd4c921510ade9f67", + "aca87680e0649d0728091c92c6de8871"}; + static const char* const kDigestsSize50[3] = { + "87c1d43751125f1ea4987517a90d378d", "942a9d056231683bdfc52346b6b032c2", + "16a9148daf0e5f69808b9f0caa1ef110"}; + static const char* const kDigestsSize55[3] = { + "833480d74957fb0356dec5b09412eefa", "a307ef31f10affc3b7fb262d05f1b80a", + "0318b2fde088c472215fe155f3b48d36"}; + static const char* const kDigestsSize65[3] = { + "5000dada34ed2e6692bb44a4398ddf53", "8da6c776d897064ecd4a1e84aae92dd3", + "d7c71db339c28d33119974987b2f9d85"}; + static const char* const kDigestsSize129[3] = { + 
"bf174d8b45b8131404fd4a4686f8c117", "e81518d6d85eed2f1b18c59424561d6b", + "7306715602b0f5536771724a2f0a39bc"}; + + switch (size) { + case 1: + return kDigestsSize1[strength - 1]; + case 2: + return kDigestsSize2[strength - 1]; + case 5: + return kDigestsSize5[strength - 1]; + case 9: + return kDigestsSize9[strength - 1]; + case 17: + return kDigestsSize17[strength - 1]; + case 33: + return kDigestsSize33[strength - 1]; + case 50: + return kDigestsSize50[strength - 1]; + case 55: + return kDigestsSize55[strength - 1]; + case 65: + return kDigestsSize65[strength - 1]; + case 129: + return kDigestsSize129[strength - 1]; + default: + ADD_FAILURE() << "Unknown edge size: " << size; + return nullptr; + } +} + +TEST_P(IntraEdgeFilterTest8bpp, Correctness) { + TestFixedValues(GetIntraEdgeFilterDigest8bpp(strength_, size_)); + TestRandomValues(1); +} + +TEST_P(IntraEdgeFilterTest8bpp, DISABLED_Speed) { TestRandomValues(5e7); } + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using IntraEdgeFilterTest10bpp = IntraEdgeFilterTest<10, uint16_t>; + +const char* GetIntraEdgeFilterDigest10bpp(int strength, int size) { + static const char* const kDigestsSize1[3] = { + "2d2088560e3ccb5b809c97f5299bb1c0", "2d2088560e3ccb5b809c97f5299bb1c0", + "2d2088560e3ccb5b809c97f5299bb1c0"}; + static const char* const kDigestsSize2[3] = { + "db3e785852e98fba18a1fb531f68466c", "8caea330489bc6ed0f99fbf769f53181", + "bcdd1b21f3baf5f6f29caea9ef93fb0c"}; + static const char* const kDigestsSize5[3] = { + "326f4193a62f5a959b86d95f5204608e", "4673e453203f75eae97ef44f43f098f2", + "48d516b06313683aca30e975ce6a3cad"}; + static const char* const kDigestsSize9[3] = { + "79217575a32e36a51d9dd40621af9c2d", "ccec1c16bc09b28ad6513c5e4c48b6d2", + "bb61aa9c5fa720c667a053769e7b7d08"}; + static const char* const kDigestsSize17[3] = { + "46d90e99ba46e89326a5fa547bcd9361", "824aee8950aecb356d5f4a91dbc90a7d", + "37d44d10a2545385af1da55f8c08564f"}; + static const char* const kDigestsSize33[3] = { + "c95108e06eb2aef61ecb6839af306edd", 
"832c695460b4dd2b85c5f8726e4470d1", + "994902f549eefd83fbcbf7ecb7dc5cca"}; + static const char* const kDigestsSize50[3] = { + "48119ef1436c3a4fe69d275bbaafedf8", "72c221c91c3df0a324ccbc9acea35f89", + "84e40aadcc416ef3f51cea3cc23b30c7"}; + static const char* const kDigestsSize55[3] = { + "6b68e4e0b00c4eb38a6d0d83c0f34658", "43a919f928a80379df5c9e07c9d8000d", + "7c320d55b11f93185b811bdaa379f2db"}; + static const char* const kDigestsSize65[3] = { + "c28de89cf9f3bc5a904647ab2c64caf7", "7ce63b1b28dce0624fc7586e8fb3ab8f", + "d06e6b88585f7f1a1f6af5bb59ee2180"}; + static const char* const kDigestsSize129[3] = { + "79160902c5c85004382d5ffa549b43cc", "3b0df95c3ca7b0b559b79234cf434738", + "500786d8561effec283d4f3d13886f8c"}; + + switch (size) { + case 1: + return kDigestsSize1[strength - 1]; + case 2: + return kDigestsSize2[strength - 1]; + case 5: + return kDigestsSize5[strength - 1]; + case 9: + return kDigestsSize9[strength - 1]; + case 17: + return kDigestsSize17[strength - 1]; + case 33: + return kDigestsSize33[strength - 1]; + case 50: + return kDigestsSize50[strength - 1]; + case 55: + return kDigestsSize55[strength - 1]; + case 65: + return kDigestsSize65[strength - 1]; + case 129: + return kDigestsSize129[strength - 1]; + default: + ADD_FAILURE() << "Unknown edge size: " << size; + return nullptr; + } +} + +TEST_P(IntraEdgeFilterTest10bpp, FixedInput) { + TestFixedValues(GetIntraEdgeFilterDigest10bpp(strength_, size_)); + TestRandomValues(1); +} + +TEST_P(IntraEdgeFilterTest10bpp, DISABLED_Speed) { TestRandomValues(5e7); } +#endif + +template <int bitdepth, typename Pixel> +class IntraEdgeUpsamplerTest : public testing::TestWithParam<int> { + public: + IntraEdgeUpsamplerTest() = default; + IntraEdgeUpsamplerTest(const IntraEdgeUpsamplerTest&) = delete; + IntraEdgeUpsamplerTest& operator=(const IntraEdgeUpsamplerTest&) = delete; + ~IntraEdgeUpsamplerTest() override = default; + + protected: + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + 
IntraEdgeInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_intra_edge_upsampler_ = dsp->intra_edge_upsampler; + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + base_intra_edge_upsampler_ = nullptr; + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + IntraEdgeInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + IntraEdgeInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + cur_intra_edge_upsampler_ = dsp->intra_edge_upsampler; + } + + void TestFixedValues(const char* digest); + void TestRandomValues(int num_runs); + + Pixel buffer_[128]; + Pixel base_buffer_[128]; + int size_ = GetParam(); + + IntraEdgeUpsamplerFunc base_intra_edge_upsampler_; + IntraEdgeUpsamplerFunc cur_intra_edge_upsampler_; +}; + +template <int bitdepth, typename Pixel> +void IntraEdgeUpsamplerTest<bitdepth, Pixel>::TestFixedValues( + const char* const digest) { + if (cur_intra_edge_upsampler_ == nullptr) return; + buffer_[0] = 0; + for (int i = 0; i < size_ + 1; ++i) { + buffer_[i + 1] = kIntraEdgeUpsamplerTestFixedInput[i]; + } + const absl::Time start = absl::Now(); + cur_intra_edge_upsampler_(buffer_ + 2, size_); + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest(kIntraEdge, kIntraEdgeUpsamplerName, digest, + buffer_, (size_ * 2 + 1) * sizeof(buffer_[0]), + elapsed_time); +} + +template <int bitdepth, typename Pixel> +void IntraEdgeUpsamplerTest<bitdepth, Pixel>::TestRandomValues(int num_runs) { + if (base_intra_edge_upsampler_ == nullptr) return; + if (cur_intra_edge_upsampler_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + absl::Duration base_elapsed_time; + absl::Duration 
elapsed_time; + for (int num_tests = 0; num_tests < num_runs; ++num_tests) { + // Populate what will be buffer[-2..size] when passed to the upsample + // function. + buffer_[0] = 0; + base_buffer_[0] = 0; + for (int i = 1; i < size_ + 2; ++i) { + const Pixel val = rnd(bitdepth); + buffer_[i] = val; + base_buffer_[i] = val; + } + const absl::Time base_start = absl::Now(); + base_intra_edge_upsampler_(base_buffer_ + 2, size_); + base_elapsed_time += absl::Now() - base_start; + const absl::Time start = absl::Now(); + cur_intra_edge_upsampler_(buffer_ + 2, size_); + elapsed_time += absl::Now() - start; + } + if (num_runs > 1) { + printf("Mode %s[%31s] size %d C: %5d us SIMD: %5d us %2.2fx\n", kIntraEdge, + kIntraEdgeUpsamplerName, size_, + static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time)), + static_cast<int>(absl::ToInt64Microseconds(elapsed_time)), + absl::ToDoubleMicroseconds(base_elapsed_time) / + absl::ToDoubleMicroseconds(elapsed_time)); + } else { + printf("Mode %s[%31s]: size %d \n", kIntraEdge, kIntraEdgeUpsamplerName, + size_); + } + + for (int i = 0; i < size_ * 2 + 1; ++i) { + EXPECT_EQ(buffer_[i], base_buffer_[i]) << "Mismatch in index: " << i; + } +} + +using IntraEdgeUpsamplerTest8bpp = IntraEdgeUpsamplerTest<8, uint8_t>; + +constexpr int kIntraEdgeUpsampleSizes[] = {4, 8, 12, 16}; + +const char* GetIntraEdgeUpsampleDigest8bpp(int size) { + switch (size) { + case 4: + return "aa9002e03f8d15eb26bbee76f40bb923"; + case 8: + return "cacfca86d65eff0d951eb21fc15f242a"; + case 12: + return "0529e00a1fa80bc866fa7662ad2d7b9f"; + case 16: + return "03e3b3e0ea438ea48ef05651c0a54986"; + default: + ADD_FAILURE() << "Unknown upsample size: " << size; + return ""; + } +} + +TEST_P(IntraEdgeUpsamplerTest8bpp, Correctness) { + TestFixedValues(GetIntraEdgeUpsampleDigest8bpp(size_)); + TestRandomValues(1); +} + +TEST_P(IntraEdgeUpsamplerTest8bpp, DISABLED_Speed) { TestRandomValues(5e7); } + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using IntraEdgeUpsamplerTest10bpp = 
IntraEdgeUpsamplerTest<10, uint16_t>; + +const char* GetIntraEdgeUpsampleDigest10bpp(int size) { + switch (size) { + case 4: + return "341c6bb705a02bba65b34f92d8ca83cf"; + case 8: + return "fdbe4b3b341921dcb0edf00dfc4d7667"; + case 12: + return "ad69a491287495ec9973d4006d5ac461"; + case 16: + return "04acf32e517d80ce4c4958e711b9b890"; + default: + ADD_FAILURE() << "Unknown upsample size: " << size; + return ""; + } +} + +TEST_P(IntraEdgeUpsamplerTest10bpp, FixedInput) { + TestFixedValues(GetIntraEdgeUpsampleDigest10bpp(size_)); + TestRandomValues(1); +} + +TEST_P(IntraEdgeUpsamplerTest10bpp, DISABLED_Speed) { TestRandomValues(5e7); } +#endif + +INSTANTIATE_TEST_SUITE_P(C, IntraEdgeFilterTest8bpp, + testing::ValuesIn(kIntraEdgeFilterParamList)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, IntraEdgeFilterTest8bpp, + testing::ValuesIn(kIntraEdgeFilterParamList)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeFilterTest8bpp, + testing::ValuesIn(kIntraEdgeFilterParamList)); +#endif +INSTANTIATE_TEST_SUITE_P(C, IntraEdgeUpsamplerTest8bpp, + testing::ValuesIn(kIntraEdgeUpsampleSizes)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, IntraEdgeUpsamplerTest8bpp, + testing::ValuesIn(kIntraEdgeUpsampleSizes)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeUpsamplerTest8bpp, + testing::ValuesIn(kIntraEdgeUpsampleSizes)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +INSTANTIATE_TEST_SUITE_P(C, IntraEdgeFilterTest10bpp, + testing::ValuesIn(kIntraEdgeFilterParamList)); +INSTANTIATE_TEST_SUITE_P(C, IntraEdgeUpsamplerTest10bpp, + testing::ValuesIn(kIntraEdgeUpsampleSizes)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeFilterTest10bpp, + testing::ValuesIn(kIntraEdgeFilterParamList)); +INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeUpsamplerTest10bpp, + testing::ValuesIn(kIntraEdgeUpsampleSizes)); +#endif + +#endif +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git 
a/src/dsp/intrapred.cc b/src/dsp/intrapred.cc index 4bcb580..4520c2c 100644 --- a/src/dsp/intrapred.cc +++ b/src/dsp/intrapred.cc @@ -19,21 +19,18 @@ #include <cstddef> #include <cstdint> #include <cstdlib> -#include <cstring> // memset +#include <cstring> #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/utils/common.h" +#include "src/utils/constants.h" #include "src/utils/memory.h" namespace libgav1 { namespace dsp { namespace { -constexpr TransformSize kTransformSizesLargerThan32x32[] = { - kTransformSize16x64, kTransformSize32x64, kTransformSize64x16, - kTransformSize64x32, kTransformSize64x64}; - template <int block_width, int block_height, typename Pixel> struct IntraPredFuncs_C { IntraPredFuncs_C() = delete; @@ -50,12 +47,6 @@ struct IntraPredFuncs_C { const void* left_column); static void Paeth(void* dest, ptrdiff_t stride, const void* top_row, const void* left_column); - static void Smooth(void* dest, ptrdiff_t stride, const void* top_row, - const void* left_column); - static void SmoothVertical(void* dest, ptrdiff_t stride, const void* top_row, - const void* left_column); - static void SmoothHorizontal(void* dest, ptrdiff_t stride, - const void* top_row, const void* left_column); }; // Intra-predictors that require bitdepth. 
@@ -190,16 +181,6 @@ void IntraPredFuncs_C<block_width, block_height, Pixel>::Horizontal( } } -template <typename Pixel> -inline Pixel Average(Pixel a, Pixel b) { - return static_cast<Pixel>((a + b + 1) >> 1); -} - -template <typename Pixel> -inline Pixel Average(Pixel a, Pixel b, Pixel c) { - return static_cast<Pixel>((a + 2 * b + c + 2) >> 2); -} - // IntraPredFuncs_C::Paeth template <int block_width, int block_height, typename Pixel> void IntraPredFuncs_C<block_width, block_height, Pixel>::Paeth( @@ -238,110 +219,6 @@ void IntraPredFuncs_C<block_width, block_height, Pixel>::Paeth( } } -constexpr uint8_t kSmoothWeights[] = { - // block dimension = 4 - 255, 149, 85, 64, - // block dimension = 8 - 255, 197, 146, 105, 73, 50, 37, 32, - // block dimension = 16 - 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, - // block dimension = 32 - 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, - 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, - // block dimension = 64 - 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, - 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, - 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, - 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4}; - -// IntraPredFuncs_C::Smooth -template <int block_width, int block_height, typename Pixel> -void IntraPredFuncs_C<block_width, block_height, Pixel>::Smooth( - void* const dest, ptrdiff_t stride, const void* const top_row, - const void* const left_column) { - const auto* const top = static_cast<const Pixel*>(top_row); - const auto* const left = static_cast<const Pixel*>(left_column); - const Pixel top_right = top[block_width - 1]; - const Pixel bottom_left = left[block_height - 1]; - static_assert( - block_width >= 4 && block_height >= 4, - "Weights for smooth predictor undefined for block width/height < 4"); - const uint8_t* const weights_x = kSmoothWeights + block_width - 
4; - const uint8_t* const weights_y = kSmoothWeights + block_height - 4; - const uint16_t scale_value = (1 << kSmoothWeightScale); - auto* dst = static_cast<Pixel*>(dest); - stride /= sizeof(Pixel); - - for (int y = 0; y < block_height; ++y) { - for (int x = 0; x < block_width; ++x) { - assert(scale_value >= weights_y[y] && scale_value >= weights_x[x]); - uint32_t pred = weights_y[y] * top[x]; - pred += weights_x[x] * left[y]; - pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left; - pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right; - // The maximum value of pred with the rounder is 2^9 * (2^bitdepth - 1) - // + 256. With the descale there's no need for saturation. - dst[x] = static_cast<Pixel>( - RightShiftWithRounding(pred, kSmoothWeightScale + 1)); - } - dst += stride; - } -} - -// IntraPredFuncs_C::SmoothVertical -template <int block_width, int block_height, typename Pixel> -void IntraPredFuncs_C<block_width, block_height, Pixel>::SmoothVertical( - void* const dest, ptrdiff_t stride, const void* const top_row, - const void* const left_column) { - const auto* const top = static_cast<const Pixel*>(top_row); - const auto* const left = static_cast<const Pixel*>(left_column); - const Pixel bottom_left = left[block_height - 1]; - static_assert(block_height >= 4, - "Weights for smooth predictor undefined for block height < 4"); - const uint8_t* const weights_y = kSmoothWeights + block_height - 4; - const uint16_t scale_value = (1 << kSmoothWeightScale); - auto* dst = static_cast<Pixel*>(dest); - stride /= sizeof(Pixel); - - for (int y = 0; y < block_height; ++y) { - for (int x = 0; x < block_width; ++x) { - assert(scale_value >= weights_y[y]); - uint32_t pred = weights_y[y] * top[x]; - pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left; - dst[x] = - static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale)); - } - dst += stride; - } -} - -// IntraPredFuncs_C::SmoothHorizontal -template <int 
block_width, int block_height, typename Pixel> -void IntraPredFuncs_C<block_width, block_height, Pixel>::SmoothHorizontal( - void* const dest, ptrdiff_t stride, const void* const top_row, - const void* const left_column) { - const auto* const top = static_cast<const Pixel*>(top_row); - const auto* const left = static_cast<const Pixel*>(left_column); - const Pixel top_right = top[block_width - 1]; - static_assert(block_width >= 4, - "Weights for smooth predictor undefined for block width < 4"); - const uint8_t* const weights_x = kSmoothWeights + block_width - 4; - const uint16_t scale_value = (1 << kSmoothWeightScale); - auto* dst = static_cast<Pixel*>(dest); - stride /= sizeof(Pixel); - - for (int y = 0; y < block_height; ++y) { - for (int x = 0; x < block_width; ++x) { - assert(scale_value >= weights_x[x]); - uint32_t pred = weights_x[x] * left[y]; - pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right; - dst[x] = - static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale)); - } - dst += stride; - } -} - //------------------------------------------------------------------------------ // IntraPredBppFuncs_C template <int fill, typename Pixel> @@ -366,288 +243,7 @@ void IntraPredBppFuncs_C<block_width, block_height, bitdepth, Pixel>::DcFill( block_height); } -//------------------------------------------------------------------------------ -// FilterIntraPredictor_C - -template <int bitdepth, typename Pixel> -void FilterIntraPredictor_C(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column, - const FilterIntraPredictor pred, const int width, - const int height) { - const int kMaxPixel = (1 << bitdepth) - 1; - const auto* const top = static_cast<const Pixel*>(top_row); - const auto* const left = static_cast<const Pixel*>(left_column); - - assert(width <= 32 && height <= 32); - - Pixel buffer[3][33]; // cache 2 rows + top & left boundaries - memcpy(buffer[0], &top[-1], (width + 1) * sizeof(top[0])); - 
- auto* dst = static_cast<Pixel*>(dest); - stride /= sizeof(Pixel); - int row0 = 0, row2 = 2; - int ystep = 1; - int y = 0; - do { - buffer[1][0] = left[y]; - buffer[row2][0] = left[y + 1]; - int x = 1; - do { - const Pixel p0 = buffer[row0][x - 1]; // top-left - const Pixel p1 = buffer[row0][x + 0]; // top 0 - const Pixel p2 = buffer[row0][x + 1]; // top 1 - const Pixel p3 = buffer[row0][x + 2]; // top 2 - const Pixel p4 = buffer[row0][x + 3]; // top 3 - const Pixel p5 = buffer[1][x - 1]; // left 0 - const Pixel p6 = buffer[row2][x - 1]; // left 1 - for (int i = 0; i < 8; ++i) { - const int xoffset = i & 0x03; - const int yoffset = (i >> 2) * ystep; - const int value = kFilterIntraTaps[pred][i][0] * p0 + - kFilterIntraTaps[pred][i][1] * p1 + - kFilterIntraTaps[pred][i][2] * p2 + - kFilterIntraTaps[pred][i][3] * p3 + - kFilterIntraTaps[pred][i][4] * p4 + - kFilterIntraTaps[pred][i][5] * p5 + - kFilterIntraTaps[pred][i][6] * p6; - buffer[1 + yoffset][x + xoffset] = static_cast<Pixel>( - Clip3(RightShiftWithRounding(value, 4), 0, kMaxPixel)); - } - x += 4; - } while (x < width); - memcpy(dst, &buffer[1][1], width * sizeof(dst[0])); - dst += stride; - memcpy(dst, &buffer[row2][1], width * sizeof(dst[0])); - dst += stride; - - // The final row becomes the top for the next pass. - row0 ^= 2; - row2 ^= 2; - ystep = -ystep; - y += 2; - } while (y < height); -} - -//------------------------------------------------------------------------------ -// CflIntraPredictor_C - -// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive. -// |alpha| can be -16 to 16 (inclusive). 
-template <int block_width, int block_height, int bitdepth, typename Pixel> -void CflIntraPredictor_C( - void* const dest, ptrdiff_t stride, - const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], - const int alpha) { - auto* dst = static_cast<Pixel*>(dest); - const int dc = dst[0]; - stride /= sizeof(Pixel); - const int max_value = (1 << bitdepth) - 1; - for (int y = 0; y < block_height; ++y) { - for (int x = 0; x < block_width; ++x) { - assert(luma[y][x] >= -(((1 << bitdepth) - 1) << 3)); - assert(luma[y][x] <= ((1 << bitdepth) - 1) << 3); - dst[x] = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma[y][x], 6), - 0, max_value); - } - dst += stride; - } -} - -//------------------------------------------------------------------------------ -// CflSubsampler_C - -template <int block_width, int block_height, int bitdepth, typename Pixel, - int subsampling_x, int subsampling_y> -void CflSubsampler_C(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], - const int max_luma_width, const int max_luma_height, - const void* const source, ptrdiff_t stride) { - assert(max_luma_width >= 4); - assert(max_luma_height >= 4); - const auto* src = static_cast<const Pixel*>(source); - stride /= sizeof(Pixel); - int sum = 0; - for (int y = 0; y < block_height; ++y) { - for (int x = 0; x < block_width; ++x) { - const ptrdiff_t luma_x = - std::min(x << subsampling_x, max_luma_width - (1 << subsampling_x)); - const ptrdiff_t luma_x_next = luma_x + stride; - luma[y][x] = - (src[luma_x] + ((subsampling_x != 0) ? src[luma_x + 1] : 0) + - ((subsampling_y != 0) ? 
(src[luma_x_next] + src[luma_x_next + 1]) - : 0)) - << (3 - subsampling_x - subsampling_y); - sum += luma[y][x]; - } - if ((y << subsampling_y) < (max_luma_height - (1 << subsampling_y))) { - src += stride << subsampling_y; - } - } - const int average = RightShiftWithRounding( - sum, FloorLog2(block_width) + FloorLog2(block_height)); - for (int y = 0; y < block_height; ++y) { - for (int x = 0; x < block_width; ++x) { - luma[y][x] -= average; - } - } -} - -//------------------------------------------------------------------------------ -// 7.11.2.4. Directional intra prediction process - -template <typename Pixel> -void DirectionalIntraPredictorZone1_C(void* const dest, ptrdiff_t stride, - const void* const top_row, - const int width, const int height, - const int xstep, - const bool upsampled_top) { - const auto* const top = static_cast<const Pixel*>(top_row); - auto* dst = static_cast<Pixel*>(dest); - stride /= sizeof(Pixel); - - assert(xstep > 0); - - // If xstep == 64 then |shift| always evaluates to 0 which sets |val| to - // |top[top_base_x]|. This corresponds to a 45 degree prediction. - if (xstep == 64) { - // 7.11.2.10. Intra edge upsample selection process - // if ( d <= 0 || d >= 40 ) useUpsample = 0 - // For |upsampled_top| the delta is |predictor_angle - 90|. Since the - // |predictor_angle| is 45 the delta is also 45. 
- assert(!upsampled_top); - const Pixel* top_ptr = top + 1; - for (int y = 0; y < height; ++y, dst += stride, ++top_ptr) { - memcpy(dst, top_ptr, sizeof(*top_ptr) * width); - } - return; - } - - const int upsample_shift = static_cast<int>(upsampled_top); - const int max_base_x = ((width + height) - 1) << upsample_shift; - const int scale_bits = 6 - upsample_shift; - const int base_step = 1 << upsample_shift; - int top_x = xstep; - int y = 0; - do { - int top_base_x = top_x >> scale_bits; - - if (top_base_x >= max_base_x) { - for (int i = y; i < height; ++i) { - Memset(dst, top[max_base_x], width); - dst += stride; - } - return; - } - - const int shift = ((top_x << upsample_shift) & 0x3F) >> 1; - int x = 0; - do { - if (top_base_x >= max_base_x) { - Memset(dst + x, top[max_base_x], width - x); - break; - } - - const int val = - top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift; - dst[x] = RightShiftWithRounding(val, 5); - top_base_x += base_step; - } while (++x < width); - - dst += stride; - top_x += xstep; - } while (++y < height); -} - -template <typename Pixel> -void DirectionalIntraPredictorZone2_C(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column, - const int width, const int height, - const int xstep, const int ystep, - const bool upsampled_top, - const bool upsampled_left) { - const auto* const top = static_cast<const Pixel*>(top_row); - const auto* const left = static_cast<const Pixel*>(left_column); - auto* dst = static_cast<Pixel*>(dest); - stride /= sizeof(Pixel); - - assert(xstep > 0); - assert(ystep > 0); - - const int upsample_top_shift = static_cast<int>(upsampled_top); - const int upsample_left_shift = static_cast<int>(upsampled_left); - const int scale_bits_x = 6 - upsample_top_shift; - const int scale_bits_y = 6 - upsample_left_shift; - const int min_base_x = -(1 << upsample_top_shift); - const int base_step_x = 1 << upsample_top_shift; - int y = 0; - int top_x = -xstep; - do { - int 
top_base_x = top_x >> scale_bits_x; - int left_y = (y << 6) - ystep; - int x = 0; - do { - int val; - if (top_base_x >= min_base_x) { - const int shift = ((top_x * (1 << upsample_top_shift)) & 0x3F) >> 1; - val = top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift; - } else { - // Note this assumes an arithmetic shift to handle negative values. - const int left_base_y = left_y >> scale_bits_y; - const int shift = ((left_y * (1 << upsample_left_shift)) & 0x3F) >> 1; - assert(left_base_y >= -(1 << upsample_left_shift)); - val = left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift; - } - dst[x] = RightShiftWithRounding(val, 5); - top_base_x += base_step_x; - left_y -= ystep; - } while (++x < width); - - top_x -= xstep; - dst += stride; - } while (++y < height); -} - -template <typename Pixel> -void DirectionalIntraPredictorZone3_C(void* const dest, ptrdiff_t stride, - const void* const left_column, - const int width, const int height, - const int ystep, - const bool upsampled_left) { - const auto* const left = static_cast<const Pixel*>(left_column); - stride /= sizeof(Pixel); - - assert(ystep > 0); - - const int upsample_shift = static_cast<int>(upsampled_left); - const int scale_bits = 6 - upsample_shift; - const int base_step = 1 << upsample_shift; - // Zone3 never runs out of left_column values. 
- assert((width + height - 1) << upsample_shift > // max_base_y - ((ystep * width) >> scale_bits) + - base_step * (height - 1)); // left_base_y - - int left_y = ystep; - int x = 0; - do { - auto* dst = static_cast<Pixel*>(dest); - - int left_base_y = left_y >> scale_bits; - int y = 0; - do { - const int shift = ((left_y << upsample_shift) & 0x3F) >> 1; - const int val = - left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift; - dst[x] = RightShiftWithRounding(val, 5); - dst += stride; - left_base_y += base_step; - } while (++y < height); - - left_y += ystep; - } while (++x < width); -} - -//------------------------------------------------------------------------------ +// ----------------------------------------------------------------------------- template <typename Pixel> struct IntraPredDefs { @@ -718,15 +314,7 @@ using Defs8bpp = IntraPredBppDefs<8, uint8_t>; dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorHorizontal] = \ DEFS::_##W##x##H::Horizontal; \ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorPaeth] = \ - DEFS::_##W##x##H::Paeth; \ - dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorSmooth] = \ - DEFS::_##W##x##H::Smooth; \ - dsp->intra_predictors[kTransformSize##W##x##H] \ - [kIntraPredictorSmoothVertical] = \ - DEFS::_##W##x##H::SmoothVertical; \ - dsp->intra_predictors[kTransformSize##W##x##H] \ - [kIntraPredictorSmoothHorizontal] = \ - DEFS::_##W##x##H::SmoothHorizontal + DEFS::_##W##x##H::Paeth #define INIT_INTRAPREDICTORS(DEFS, DEFSBPP) \ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 4); \ @@ -749,45 +337,11 @@ using Defs8bpp = IntraPredBppDefs<8, uint8_t>; INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 32); \ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 64) -#define INIT_CFL_INTRAPREDICTOR_WxH(W, H, BITDEPTH, PIXEL) \ - dsp->cfl_intra_predictors[kTransformSize##W##x##H] = \ - CflIntraPredictor_C<W, H, BITDEPTH, PIXEL>; \ - dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType444] = \ - 
CflSubsampler_C<W, H, BITDEPTH, PIXEL, 0, 0>; \ - dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType422] = \ - CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 0>; \ - dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType420] = \ - CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 1> - -#define INIT_CFL_INTRAPREDICTORS(BITDEPTH, PIXEL) \ - INIT_CFL_INTRAPREDICTOR_WxH(4, 4, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(4, 8, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(4, 16, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(8, 4, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(8, 8, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(8, 16, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(8, 32, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(16, 4, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(16, 8, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(16, 16, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(16, 32, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(32, 8, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(32, 16, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(32, 32, BITDEPTH, PIXEL) - void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(8); assert(dsp != nullptr); #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS INIT_INTRAPREDICTORS(Defs, Defs8bpp); - dsp->directional_intra_predictor_zone1 = - DirectionalIntraPredictorZone1_C<uint8_t>; - dsp->directional_intra_predictor_zone2 = - DirectionalIntraPredictorZone2_C<uint8_t>; - dsp->directional_intra_predictor_zone3 = - DirectionalIntraPredictorZone3_C<uint8_t>; - dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>; - INIT_CFL_INTRAPREDICTORS(8, uint8_t); #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] = @@ -816,19 +370,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] = Defs::_4x4::Paeth; #endif -#ifndef 
LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] = - Defs::_4x4::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] = - Defs::_4x4::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] = - Defs::_4x4::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcFill dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] = Defs8bpp::_4x8::DcFill; @@ -856,19 +397,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] = Defs::_4x8::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] = - Defs::_4x8::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] = - Defs::_4x8::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = - Defs::_4x8::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] = Defs8bpp::_4x16::DcFill; @@ -897,19 +425,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] = Defs::_4x16::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] = - Defs::_4x16::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] = - Defs::_4x16::SmoothVertical; -#endif -#ifndef 
LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = - Defs::_4x16::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcFill dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] = Defs8bpp::_8x4::DcFill; @@ -937,19 +452,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] = Defs::_8x4::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] = - Defs::_8x4::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] = - Defs::_8x4::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = - Defs::_8x4::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcFill dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] = Defs8bpp::_8x8::DcFill; @@ -977,19 +479,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] = Defs::_8x8::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] = - Defs::_8x8::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] = - Defs::_8x8::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = - Defs::_8x8::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] = Defs8bpp::_8x16::DcFill; @@ -1018,19 
+507,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] = Defs::_8x16::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] = - Defs::_8x16::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] = - Defs::_8x16::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = - Defs::_8x16::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcFill dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] = Defs8bpp::_8x32::DcFill; @@ -1059,19 +535,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] = Defs::_8x32::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] = - Defs::_8x32::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] = - Defs::_8x32::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = - Defs::_8x32::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] = Defs8bpp::_16x4::DcFill; @@ -1100,19 +563,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] = Defs::_16x4::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] = - Defs::_16x4::Smooth; -#endif -#ifndef 
LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] = - Defs::_16x4::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] = - Defs::_16x4::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] = Defs8bpp::_16x8::DcFill; @@ -1141,19 +591,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] = Defs::_16x8::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] = - Defs::_16x8::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] = - Defs::_16x8::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] = - Defs::_16x8::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] = Defs8bpp::_16x16::DcFill; @@ -1182,19 +619,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] = Defs::_16x16::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] = - Defs::_16x16::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] = - Defs::_16x16::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal - 
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] = - Defs::_16x16::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] = Defs8bpp::_16x32::DcFill; @@ -1223,19 +647,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] = Defs::_16x32::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] = - Defs::_16x32::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] = - Defs::_16x32::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] = - Defs::_16x32::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] = Defs8bpp::_16x64::DcFill; @@ -1264,19 +675,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] = Defs::_16x64::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] = - Defs::_16x64::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] = - Defs::_16x64::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] = - Defs::_16x64::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcFill dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] = Defs8bpp::_32x8::DcFill; @@ -1305,19 +703,6 @@ 
void Init8bpp() { dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] = Defs::_32x8::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] = - Defs::_32x8::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] = - Defs::_32x8::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] = - Defs::_32x8::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] = Defs8bpp::_32x16::DcFill; @@ -1346,19 +731,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] = Defs::_32x16::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] = - Defs::_32x16::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] = - Defs::_32x16::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] = - Defs::_32x16::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcFill dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] = Defs8bpp::_32x32::DcFill; @@ -1387,19 +759,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] = Defs::_32x32::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] = - Defs::_32x32::Smooth; -#endif -#ifndef 
LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] = - Defs::_32x32::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] = - Defs::_32x32::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcFill dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] = Defs8bpp::_32x64::DcFill; @@ -1428,19 +787,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] = Defs::_32x64::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] = - Defs::_32x64::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] = - Defs::_32x64::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] = - Defs::_32x64::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] = Defs8bpp::_64x16::DcFill; @@ -1469,19 +815,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] = Defs::_64x16::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] = - Defs::_64x16::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] = - Defs::_64x16::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal - 
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] = - Defs::_64x16::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcFill dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] = Defs8bpp::_64x32::DcFill; @@ -1510,19 +843,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] = Defs::_64x32::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] = - Defs::_64x32::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] = - Defs::_64x32::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] = - Defs::_64x32::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcFill dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] = Defs8bpp::_64x64::DcFill; @@ -1551,282 +871,7 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] = Defs::_64x64::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] = - Defs::_64x64::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] = - Defs::_64x64::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = - Defs::_64x64::SmoothHorizontal; -#endif - -#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 - dsp->directional_intra_predictor_zone1 = - DirectionalIntraPredictorZone1_C<uint8_t>; -#endif -#ifndef 
LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 - dsp->directional_intra_predictor_zone2 = - DirectionalIntraPredictorZone2_C<uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 - dsp->directional_intra_predictor_zone3 = - DirectionalIntraPredictorZone3_C<uint8_t>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor - dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize4x4] = - CflIntraPredictor_C<4, 4, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = - CflSubsampler_C<4, 4, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] = - CflSubsampler_C<4, 4, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = - CflSubsampler_C<4, 4, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize4x8] = - CflIntraPredictor_C<4, 8, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = - CflSubsampler_C<4, 8, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] = - CflSubsampler_C<4, 8, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = - CflSubsampler_C<4, 8, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize4x16] = - CflIntraPredictor_C<4, 16, 8, uint8_t>; -#endif -#ifndef 
LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = - CflSubsampler_C<4, 16, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] = - CflSubsampler_C<4, 16, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = - CflSubsampler_C<4, 16, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize8x4] = - CflIntraPredictor_C<8, 4, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = - CflSubsampler_C<8, 4, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] = - CflSubsampler_C<8, 4, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = - CflSubsampler_C<8, 4, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize8x8] = - CflIntraPredictor_C<8, 8, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = - CflSubsampler_C<8, 8, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] = - CflSubsampler_C<8, 8, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = - CflSubsampler_C<8, 8, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor - 
dsp->cfl_intra_predictors[kTransformSize8x16] = - CflIntraPredictor_C<8, 16, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = - CflSubsampler_C<8, 16, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] = - CflSubsampler_C<8, 16, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = - CflSubsampler_C<8, 16, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize8x32] = - CflIntraPredictor_C<8, 32, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = - CflSubsampler_C<8, 32, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] = - CflSubsampler_C<8, 32, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = - CflSubsampler_C<8, 32, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize16x4] = - CflIntraPredictor_C<16, 4, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = - CflSubsampler_C<16, 4, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] = - CflSubsampler_C<16, 4, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = - CflSubsampler_C<16, 4, 
8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize16x8] = - CflIntraPredictor_C<16, 8, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = - CflSubsampler_C<16, 8, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] = - CflSubsampler_C<16, 8, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = - CflSubsampler_C<16, 8, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize16x16] = - CflIntraPredictor_C<16, 16, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = - CflSubsampler_C<16, 16, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] = - CflSubsampler_C<16, 16, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = - CflSubsampler_C<16, 16, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize16x32] = - CflIntraPredictor_C<16, 32, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = - CflSubsampler_C<16, 32, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] = - CflSubsampler_C<16, 32, 8, uint8_t, 1, 0>; -#endif -#ifndef 
LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = - CflSubsampler_C<16, 32, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize32x8] = - CflIntraPredictor_C<32, 8, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = - CflSubsampler_C<32, 8, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] = - CflSubsampler_C<32, 8, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = - CflSubsampler_C<32, 8, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize32x16] = - CflIntraPredictor_C<32, 16, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = - CflSubsampler_C<32, 16, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] = - CflSubsampler_C<32, 16, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = - CflSubsampler_C<32, 16, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize32x32] = - CflIntraPredictor_C<32, 32, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = - CflSubsampler_C<32, 32, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler422 - 
dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] = - CflSubsampler_C<32, 32, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = - CflSubsampler_C<32, 32, 8, uint8_t, 1, 1>; -#endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS - // Cfl predictors are available only for transform sizes with max(width, - // height) <= 32. Set all others to nullptr. - for (const auto i : kTransformSizesLargerThan32x32) { - dsp->cfl_intra_predictors[i] = nullptr; - for (int j = 0; j < kNumSubsamplingTypes; ++j) { - dsp->cfl_subsamplers[i][j] = nullptr; - } - } } // NOLINT(readability/fn_size) #if LIBGAV1_MAX_BITDEPTH >= 10 @@ -1838,14 +883,6 @@ void Init10bpp() { assert(dsp != nullptr); #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS INIT_INTRAPREDICTORS(DefsHbd, Defs10bpp); - dsp->directional_intra_predictor_zone1 = - DirectionalIntraPredictorZone1_C<uint16_t>; - dsp->directional_intra_predictor_zone2 = - DirectionalIntraPredictorZone2_C<uint16_t>; - dsp->directional_intra_predictor_zone3 = - DirectionalIntraPredictorZone3_C<uint16_t>; - dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>; - INIT_CFL_INTRAPREDICTORS(10, uint16_t); #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS #ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcFill dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] = @@ -1875,19 +912,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] = DefsHbd::_4x4::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] = - DefsHbd::_4x4::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] = - DefsHbd::_4x4::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal - 
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] = - DefsHbd::_4x4::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcFill dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] = Defs10bpp::_4x8::DcFill; @@ -1916,19 +940,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] = DefsHbd::_4x8::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] = - DefsHbd::_4x8::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] = - DefsHbd::_4x8::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = - DefsHbd::_4x8::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] = Defs10bpp::_4x16::DcFill; @@ -1957,19 +968,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] = DefsHbd::_4x16::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] = - DefsHbd::_4x16::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] = - DefsHbd::_4x16::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = - DefsHbd::_4x16::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcFill dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] = Defs10bpp::_8x4::DcFill; @@ -1998,19 +996,6 @@ void 
Init10bpp() { dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] = DefsHbd::_8x4::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] = - DefsHbd::_8x4::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] = - DefsHbd::_8x4::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = - DefsHbd::_8x4::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcFill dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] = Defs10bpp::_8x8::DcFill; @@ -2039,19 +1024,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] = DefsHbd::_8x8::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] = - DefsHbd::_8x8::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] = - DefsHbd::_8x8::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = - DefsHbd::_8x8::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] = Defs10bpp::_8x16::DcFill; @@ -2080,19 +1052,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] = DefsHbd::_8x16::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] = - DefsHbd::_8x16::Smooth; -#endif -#ifndef 
LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] = - DefsHbd::_8x16::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = - DefsHbd::_8x16::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcFill dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] = Defs10bpp::_8x32::DcFill; @@ -2121,19 +1080,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] = DefsHbd::_8x32::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] = - DefsHbd::_8x32::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] = - DefsHbd::_8x32::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = - DefsHbd::_8x32::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] = Defs10bpp::_16x4::DcFill; @@ -2162,19 +1108,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] = DefsHbd::_16x4::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] = - DefsHbd::_16x4::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] = - DefsHbd::_16x4::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal - 
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] = - DefsHbd::_16x4::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] = Defs10bpp::_16x8::DcFill; @@ -2203,19 +1136,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] = DefsHbd::_16x8::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] = - DefsHbd::_16x8::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] = - DefsHbd::_16x8::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] = - DefsHbd::_16x8::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] = Defs10bpp::_16x16::DcFill; @@ -2244,19 +1164,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] = DefsHbd::_16x16::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] = - DefsHbd::_16x16::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] = - DefsHbd::_16x16::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] = - DefsHbd::_16x16::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] = 
Defs10bpp::_16x32::DcFill; @@ -2285,19 +1192,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] = DefsHbd::_16x32::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] = - DefsHbd::_16x32::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] = - DefsHbd::_16x32::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] = - DefsHbd::_16x32::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] = Defs10bpp::_16x64::DcFill; @@ -2326,19 +1220,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] = DefsHbd::_16x64::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] = - DefsHbd::_16x64::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] = - DefsHbd::_16x64::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] = - DefsHbd::_16x64::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcFill dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] = Defs10bpp::_32x8::DcFill; @@ -2367,19 +1248,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] = DefsHbd::_32x8::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth - 
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] = - DefsHbd::_32x8::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] = - DefsHbd::_32x8::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] = - DefsHbd::_32x8::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] = Defs10bpp::_32x16::DcFill; @@ -2408,19 +1276,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] = DefsHbd::_32x16::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] = - DefsHbd::_32x16::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] = - DefsHbd::_32x16::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] = - DefsHbd::_32x16::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcFill dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] = Defs10bpp::_32x32::DcFill; @@ -2449,19 +1304,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] = DefsHbd::_32x32::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] = - DefsHbd::_32x32::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] = - 
DefsHbd::_32x32::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] = - DefsHbd::_32x32::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcFill dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] = Defs10bpp::_32x64::DcFill; @@ -2490,19 +1332,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] = DefsHbd::_32x64::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] = - DefsHbd::_32x64::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] = - DefsHbd::_32x64::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] = - DefsHbd::_32x64::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] = Defs10bpp::_64x16::DcFill; @@ -2531,19 +1360,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] = DefsHbd::_64x16::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] = - DefsHbd::_64x16::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] = - DefsHbd::_64x16::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] = - DefsHbd::_64x16::SmoothHorizontal; -#endif - #ifndef 
LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcFill dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] = Defs10bpp::_64x32::DcFill; @@ -2572,19 +1388,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] = DefsHbd::_64x32::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] = - DefsHbd::_64x32::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] = - DefsHbd::_64x32::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] = - DefsHbd::_64x32::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcFill dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] = Defs10bpp::_64x64::DcFill; @@ -2613,291 +1416,12 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] = DefsHbd::_64x64::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] = - DefsHbd::_64x64::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] = - DefsHbd::_64x64::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = - DefsHbd::_64x64::SmoothHorizontal; -#endif - -#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 - dsp->directional_intra_predictor_zone1 = - DirectionalIntraPredictorZone1_C<uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2 - dsp->directional_intra_predictor_zone2 = - 
DirectionalIntraPredictorZone2_C<uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 - dsp->directional_intra_predictor_zone3 = - DirectionalIntraPredictorZone3_C<uint16_t>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_FilterIntraPredictor - dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize4x4] = - CflIntraPredictor_C<4, 4, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = - CflSubsampler_C<4, 4, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] = - CflSubsampler_C<4, 4, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = - CflSubsampler_C<4, 4, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize4x8] = - CflIntraPredictor_C<4, 8, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = - CflSubsampler_C<4, 8, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] = - CflSubsampler_C<4, 8, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = - CflSubsampler_C<4, 8, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize4x16] = - CflIntraPredictor_C<4, 16, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 - 
dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = - CflSubsampler_C<4, 16, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] = - CflSubsampler_C<4, 16, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = - CflSubsampler_C<4, 16, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize8x4] = - CflIntraPredictor_C<8, 4, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = - CflSubsampler_C<8, 4, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] = - CflSubsampler_C<8, 4, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = - CflSubsampler_C<8, 4, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize8x8] = - CflIntraPredictor_C<8, 8, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = - CflSubsampler_C<8, 8, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] = - CflSubsampler_C<8, 8, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = - CflSubsampler_C<8, 8, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize8x16] = - 
CflIntraPredictor_C<8, 16, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = - CflSubsampler_C<8, 16, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] = - CflSubsampler_C<8, 16, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = - CflSubsampler_C<8, 16, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize8x32] = - CflIntraPredictor_C<8, 32, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = - CflSubsampler_C<8, 32, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] = - CflSubsampler_C<8, 32, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = - CflSubsampler_C<8, 32, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize16x4] = - CflIntraPredictor_C<16, 4, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = - CflSubsampler_C<16, 4, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] = - CflSubsampler_C<16, 4, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = - CflSubsampler_C<16, 4, 10, uint16_t, 1, 
1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize16x8] = - CflIntraPredictor_C<16, 8, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = - CflSubsampler_C<16, 8, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] = - CflSubsampler_C<16, 8, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = - CflSubsampler_C<16, 8, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize16x16] = - CflIntraPredictor_C<16, 16, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = - CflSubsampler_C<16, 16, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] = - CflSubsampler_C<16, 16, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = - CflSubsampler_C<16, 16, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize16x32] = - CflIntraPredictor_C<16, 32, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = - CflSubsampler_C<16, 32, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] = - CflSubsampler_C<16, 32, 10, uint16_t, 1, 0>; -#endif -#ifndef 
LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = - CflSubsampler_C<16, 32, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize32x8] = - CflIntraPredictor_C<32, 8, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = - CflSubsampler_C<32, 8, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] = - CflSubsampler_C<32, 8, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = - CflSubsampler_C<32, 8, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize32x16] = - CflIntraPredictor_C<32, 16, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = - CflSubsampler_C<32, 16, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] = - CflSubsampler_C<32, 16, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = - CflSubsampler_C<32, 16, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize32x32] = - CflIntraPredictor_C<32, 32, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = - CflSubsampler_C<32, 32, 10, uint16_t, 0, 0>; -#endif -#ifndef 
LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] = - CflSubsampler_C<32, 32, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = - CflSubsampler_C<32, 32, 10, uint16_t, 1, 1>; -#endif - #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS - // Cfl predictors are available only for transform sizes with max(width, - // height) <= 32. Set all others to nullptr. - for (const auto i : kTransformSizesLargerThan32x32) { - dsp->cfl_intra_predictors[i] = nullptr; - for (int j = 0; j < kNumSubsamplingTypes; ++j) { - dsp->cfl_subsamplers[i][j] = nullptr; - } - } } // NOLINT(readability/fn_size) #endif // LIBGAV1_MAX_BITDEPTH >= 10 -#undef INIT_CFL_INTRAPREDICTOR_WxH -#undef INIT_CFL_INTRAPREDICTORS #undef INIT_INTRAPREDICTORS_WxH #undef INIT_INTRAPREDICTORS - } // namespace void IntraPredInit_C() { diff --git a/src/dsp/intrapred.h b/src/dsp/intrapred.h index c5286ef..2cb625d 100644 --- a/src/dsp/intrapred.h +++ b/src/dsp/intrapred.h @@ -38,9 +38,7 @@ namespace libgav1 { namespace dsp { -// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*, -// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and -// Dsp::filter_intra_predictor. This function is not thread-safe. +// Initializes Dsp::intra_predictors. This function is not thread-safe. void IntraPredInit_C(); } // namespace dsp diff --git a/src/dsp/intrapred_cfl.cc b/src/dsp/intrapred_cfl.cc new file mode 100644 index 0000000..948c0c0 --- /dev/null +++ b/src/dsp/intrapred_cfl.cc @@ -0,0 +1,654 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred_cfl.h" + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstdlib> +#include <cstring> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr TransformSize kTransformSizesLargerThan32x32[] = { + kTransformSize16x64, kTransformSize32x64, kTransformSize64x16, + kTransformSize64x32, kTransformSize64x64}; + +//------------------------------------------------------------------------------ +// CflIntraPredictor_C + +// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive. +// |alpha| can be -16 to 16 (inclusive). 
+template <int block_width, int block_height, int bitdepth, typename Pixel> +void CflIntraPredictor_C( + void* const dest, ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast<Pixel*>(dest); + const int dc = dst[0]; + stride /= sizeof(Pixel); + const int max_value = (1 << bitdepth) - 1; + for (int y = 0; y < block_height; ++y) { + for (int x = 0; x < block_width; ++x) { + assert(luma[y][x] >= -(((1 << bitdepth) - 1) << 3)); + assert(luma[y][x] <= ((1 << bitdepth) - 1) << 3); + dst[x] = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma[y][x], 6), + 0, max_value); + } + dst += stride; + } +} + +//------------------------------------------------------------------------------ +// CflSubsampler_C + +template <int block_width, int block_height, int bitdepth, typename Pixel, + int subsampling_x, int subsampling_y> +void CflSubsampler_C(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const auto* src = static_cast<const Pixel*>(source); + stride /= sizeof(Pixel); + int sum = 0; + for (int y = 0; y < block_height; ++y) { + for (int x = 0; x < block_width; ++x) { + const ptrdiff_t luma_x = + std::min(x << subsampling_x, max_luma_width - (1 << subsampling_x)); + const ptrdiff_t luma_x_next = luma_x + stride; + luma[y][x] = + (src[luma_x] + ((subsampling_x != 0) ? src[luma_x + 1] : 0) + + ((subsampling_y != 0) ? 
(src[luma_x_next] + src[luma_x_next + 1]) + : 0)) + << (3 - subsampling_x - subsampling_y); + sum += luma[y][x]; + } + if ((y << subsampling_y) < (max_luma_height - (1 << subsampling_y))) { + src += stride << subsampling_y; + } + } + const int average = RightShiftWithRounding( + sum, FloorLog2(block_width) + FloorLog2(block_height)); + for (int y = 0; y < block_height; ++y) { + for (int x = 0; x < block_width; ++x) { + luma[y][x] -= average; + } + } +} + +//------------------------------------------------------------------------------ + +// Initializes dsp entries for kTransformSize|W|x|H|. +#define INIT_CFL_INTRAPREDICTOR_WxH(W, H, BITDEPTH, PIXEL) \ + dsp->cfl_intra_predictors[kTransformSize##W##x##H] = \ + CflIntraPredictor_C<W, H, BITDEPTH, PIXEL>; \ + dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType444] = \ + CflSubsampler_C<W, H, BITDEPTH, PIXEL, 0, 0>; \ + dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType422] = \ + CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 0>; \ + dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType420] = \ + CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 1> + +#define INIT_CFL_INTRAPREDICTORS(BITDEPTH, PIXEL) \ + INIT_CFL_INTRAPREDICTOR_WxH(4, 4, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(4, 8, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(4, 16, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(8, 4, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(8, 8, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(8, 16, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(8, 32, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(16, 4, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(16, 8, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(16, 16, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(16, 32, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(32, 8, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(32, 16, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(32, 32, BITDEPTH, PIXEL) + +void Init8bpp() 
{ + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + INIT_CFL_INTRAPREDICTORS(8, uint8_t); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x4] = + CflIntraPredictor_C<4, 4, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = + CflSubsampler_C<4, 4, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] = + CflSubsampler_C<4, 4, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = + CflSubsampler_C<4, 4, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x8] = + CflIntraPredictor_C<4, 8, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = + CflSubsampler_C<4, 8, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] = + CflSubsampler_C<4, 8, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = + CflSubsampler_C<4, 8, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x16] = + CflIntraPredictor_C<4, 16, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler_C<4, 16, 8, uint8_t, 0, 0>; +#endif +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] = + CflSubsampler_C<4, 16, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler_C<4, 16, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x4] = + CflIntraPredictor_C<8, 4, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + CflSubsampler_C<8, 4, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] = + CflSubsampler_C<8, 4, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler_C<8, 4, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x8] = + CflIntraPredictor_C<8, 8, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler_C<8, 8, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] = + CflSubsampler_C<8, 8, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler_C<8, 8, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x16] = + CflIntraPredictor_C<8, 16, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 + 
dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler_C<8, 16, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] = + CflSubsampler_C<8, 16, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler_C<8, 16, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x32] = + CflIntraPredictor_C<8, 32, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler_C<8, 32, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] = + CflSubsampler_C<8, 32, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler_C<8, 32, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x4] = + CflIntraPredictor_C<16, 4, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler_C<16, 4, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] = + CflSubsampler_C<16, 4, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler_C<16, 4, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x8] = + 
CflIntraPredictor_C<16, 8, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler_C<16, 8, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] = + CflSubsampler_C<16, 8, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler_C<16, 8, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor_C<16, 16, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler_C<16, 16, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] = + CflSubsampler_C<16, 16, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler_C<16, 16, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor_C<16, 32, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler_C<16, 32, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] = + CflSubsampler_C<16, 32, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler_C<16, 32, 8, uint8_t, 1, 1>; +#endif 
+ +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x8] = + CflIntraPredictor_C<32, 8, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler_C<32, 8, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] = + CflSubsampler_C<32, 8, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler_C<32, 8, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor_C<32, 16, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler_C<32, 16, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] = + CflSubsampler_C<32, 16, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler_C<32, 16, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor_C<32, 32, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = + CflSubsampler_C<32, 32, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] = + CflSubsampler_C<32, 32, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 + 
dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler_C<32, 32, 8, uint8_t, 1, 1>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + // Cfl predictors are available only for transform sizes with max(width, + // height) <= 32. Set all others to nullptr. + for (const auto i : kTransformSizesLargerThan32x32) { + dsp->cfl_intra_predictors[i] = nullptr; + for (int j = 0; j < kNumSubsamplingTypes; ++j) { + dsp->cfl_subsamplers[i][j] = nullptr; + } + } +} // NOLINT(readability/fn_size) + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + INIT_CFL_INTRAPREDICTORS(10, uint16_t); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x4] = + CflIntraPredictor_C<4, 4, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = + CflSubsampler_C<4, 4, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] = + CflSubsampler_C<4, 4, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = + CflSubsampler_C<4, 4, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x8] = + CflIntraPredictor_C<4, 8, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = + CflSubsampler_C<4, 8, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] = + CflSubsampler_C<4, 8, 10, uint16_t, 1, 0>; +#endif +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = + CflSubsampler_C<4, 8, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x16] = + CflIntraPredictor_C<4, 16, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler_C<4, 16, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] = + CflSubsampler_C<4, 16, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler_C<4, 16, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x4] = + CflIntraPredictor_C<8, 4, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + CflSubsampler_C<8, 4, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] = + CflSubsampler_C<8, 4, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler_C<8, 4, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x8] = + CflIntraPredictor_C<8, 8, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler_C<8, 8, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler422 + 
dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] = + CflSubsampler_C<8, 8, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler_C<8, 8, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x16] = + CflIntraPredictor_C<8, 16, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler_C<8, 16, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] = + CflSubsampler_C<8, 16, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler_C<8, 16, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x32] = + CflIntraPredictor_C<8, 32, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler_C<8, 32, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] = + CflSubsampler_C<8, 32, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler_C<8, 32, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x4] = + CflIntraPredictor_C<16, 4, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 + 
dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler_C<16, 4, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] = + CflSubsampler_C<16, 4, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler_C<16, 4, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x8] = + CflIntraPredictor_C<16, 8, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler_C<16, 8, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] = + CflSubsampler_C<16, 8, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler_C<16, 8, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor_C<16, 16, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler_C<16, 16, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] = + CflSubsampler_C<16, 16, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler_C<16, 16, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor + 
dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor_C<16, 32, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler_C<16, 32, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] = + CflSubsampler_C<16, 32, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler_C<16, 32, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x8] = + CflIntraPredictor_C<32, 8, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler_C<32, 8, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] = + CflSubsampler_C<32, 8, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler_C<32, 8, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor_C<32, 16, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler_C<32, 16, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] = + CflSubsampler_C<32, 16, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 + 
dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler_C<32, 16, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor_C<32, 32, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = + CflSubsampler_C<32, 32, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] = + CflSubsampler_C<32, 32, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler_C<32, 32, 10, uint16_t, 1, 1>; +#endif + +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + // Cfl predictors are available only for transform sizes with max(width, + // height) <= 32. Set all others to nullptr. + for (const auto i : kTransformSizesLargerThan32x32) { + dsp->cfl_intra_predictors[i] = nullptr; + for (int j = 0; j < kNumSubsamplingTypes; ++j) { + dsp->cfl_subsamplers[i][j] = nullptr; + } + } +} // NOLINT(readability/fn_size) +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#undef INIT_CFL_INTRAPREDICTOR_WxH +#undef INIT_CFL_INTRAPREDICTORS + +} // namespace + +void IntraPredCflInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/intrapred_cfl.h b/src/dsp/intrapred_cfl.h new file mode 100644 index 0000000..4e8a11f --- /dev/null +++ b/src/dsp/intrapred_cfl.h @@ -0,0 +1,48 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_ +#define LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/intrapred_cfl_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/intrapred_cfl_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers. +// This function is not thread-safe. +void IntraPredCflInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_ diff --git a/src/dsp/intrapred_cfl_test.cc b/src/dsp/intrapred_cfl_test.cc new file mode 100644 index 0000000..e700a5b --- /dev/null +++ b/src/dsp/intrapred_cfl_test.cc @@ -0,0 +1,923 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred_cfl.h" + +#include <cmath> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <memory> +#include <ostream> + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kMaxBlockSize = 64; +constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize; + +const char* const kCflIntraPredName = "kCflIntraPredictor"; + +template <int bitdepth, typename Pixel> +class IntraPredTestBase : public testing::TestWithParam<TransformSize>, + public test_utils::MaxAlignedAllocable { + public: + IntraPredTestBase() { + switch (tx_size_) { + case kNumTransformSizes: + EXPECT_NE(tx_size_, kNumTransformSizes); + break; + default: + block_width_ = kTransformWidth[tx_size_]; + block_height_ = kTransformHeight[tx_size_]; + break; + } + } + + IntraPredTestBase(const IntraPredTestBase&) = delete; + IntraPredTestBase& operator=(const IntraPredTestBase&) = delete; + ~IntraPredTestBase() override = default; + + protected: + struct IntraPredMem { + void Reset(libvpx_test::ACMRandom* rnd) { + ASSERT_NE(rnd, nullptr); + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + const int mask = (1 << 
bitdepth) - 1; + for (auto& r : ref_src) r = rnd->Rand16() & mask; + for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask; + for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask; + + // Some directional predictors require top-right, bottom-left. + for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) { + left[i] = rnd->Rand16() & mask; + top[i] = rnd->Rand16() & mask; + } + // TODO(jzern): reorder this and regenerate the digests after switching + // random number generators. + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + left[-1] = rnd->Rand16() & mask; + left[-2] = rnd->Rand16() & mask; + top[-2] = rnd->Rand16() & mask; + memset(left_mem, 0, sizeof(left_mem[0]) * 14); + memset(top_mem, 0, sizeof(top_mem[0]) * 14); + memset(top_mem + kMaxBlockSize * 2 + 16, 0, + sizeof(top_mem[0]) * kTopMemPadding); + } + + // Set ref_src, top-left, top and left to |pixel|. + void Set(const Pixel pixel) { + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + for (auto& r : ref_src) r = pixel; + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + for (int i = -2; i < 2 * kMaxBlockSize; ++i) { + left[i] = top[i] = pixel; + } + } + + // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|. 
+ static constexpr int kTopMemPadding = 7; + alignas(kMaxAlignment) Pixel dst[kTotalPixels]; + alignas(kMaxAlignment) Pixel ref_src[kTotalPixels]; + alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16]; + alignas( + kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding]; + }; + + void SetUp() override { test_utils::ResetDspTable(bitdepth); } + + const TransformSize tx_size_ = GetParam(); + int block_width_; + int block_height_; + IntraPredMem intra_pred_mem_; +}; + +//------------------------------------------------------------------------------ +// CflIntraPredTest + +template <int bitdepth, typename Pixel> +class CflIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> { + public: + CflIntraPredTest() = default; + CflIntraPredTest(const CflIntraPredTest&) = delete; + CflIntraPredTest& operator=(const CflIntraPredTest&) = delete; + ~CflIntraPredTest() override = default; + + protected: + using IntraPredTestBase<bitdepth, Pixel>::tx_size_; + using IntraPredTestBase<bitdepth, Pixel>::block_width_; + using IntraPredTestBase<bitdepth, Pixel>::block_height_; + using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_; + + void SetUp() override { + IntraPredTestBase<bitdepth, Pixel>::SetUp(); + IntraPredCflInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_cfl_intra_pred_ = dsp->cfl_intra_predictors[tx_size_]; + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + base_cfl_intra_pred_ = nullptr; + } else if (absl::StartsWith(test_case, "NEON/")) { + IntraPredCflInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + IntraPredCflInit_SSE4_1(); + } + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + + cur_cfl_intra_pred_ = 
dsp->cfl_intra_predictors[tx_size_]; + + if (cur_cfl_intra_pred_ == base_cfl_intra_pred_) { + cur_cfl_intra_pred_ = nullptr; + } + } + + // This test modifies intra_pred_mem_. + void TestSpeed(const char* digest, int num_runs); + void TestSaturatedValues(); + void TestRandomValues(); + + CflIntraPredictorFunc base_cfl_intra_pred_; + CflIntraPredictorFunc cur_cfl_intra_pred_; +}; + +template <int bitdepth, typename Pixel> +void CflIntraPredTest<bitdepth, Pixel>::TestSpeed(const char* const digest, + const int num_runs) { + if (cur_cfl_intra_pred_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride] = {}; + const int alpha = rnd(33) - 16; + const int dc = rnd(1 << bitdepth); + const int max_luma = ((1 << bitdepth) - 1) << 3; + for (int i = 0; i < block_height_; ++i) { + for (int j = 0; j < block_width_; ++j) { + if (i < kCflLumaBufferStride && j < kCflLumaBufferStride) { + luma[i][j] = max_luma - rnd(max_luma << 1); + } + } + } + for (auto& r : intra_pred_mem_.ref_src) r = dc; + + absl::Duration elapsed_time; + for (int run = 0; run < num_runs; ++run) { + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const absl::Time start = absl::Now(); + cur_cfl_intra_pred_(intra_pred_mem_.dst, stride, luma, alpha); + elapsed_time += absl::Now() - start; + } + test_utils::CheckMd5Digest(ToString(tx_size_), kCflIntraPredName, digest, + intra_pred_mem_.dst, sizeof(intra_pred_mem_.dst), + elapsed_time); +} + +template <int bitdepth, typename Pixel> +void CflIntraPredTest<bitdepth, Pixel>::TestSaturatedValues() { + // Skip the 'C' test case as this is used as the reference. 
+ if (base_cfl_intra_pred_ == nullptr) return; + + int16_t luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride]; + for (auto& line : luma_buffer) { + for (auto& luma : line) luma = ((1 << bitdepth) - 1) << 3; + } + + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + static constexpr int kSaturatedAlpha[] = {-16, 16}; + for (const int alpha : kSaturatedAlpha) { + for (auto& r : intra_pred_mem_.ref_src) r = (1 << bitdepth) - 1; + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + base_cfl_intra_pred_(intra_pred_mem_.ref_src, stride, luma_buffer, alpha); + cur_cfl_intra_pred_(intra_pred_mem_.dst, stride, luma_buffer, alpha); + if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + block_width_, block_height_, kMaxBlockSize, + kMaxBlockSize, true)) { + ADD_FAILURE() << "Result from optimized version of CFL with alpha " + << alpha << " differs from reference."; + break; + } + } +} + +template <int bitdepth, typename Pixel> +void CflIntraPredTest<bitdepth, Pixel>::TestRandomValues() { + // Skip the 'C' test case as this is used as the reference. + if (base_cfl_intra_pred_ == nullptr) return; + int16_t luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride]; + + const int max_luma = ((1 << bitdepth) - 1) << 3; + // Use an alternate seed to differentiate this test from TestSpeed(). 
+ libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed); + for (auto& line : luma_buffer) { + for (auto& luma : line) luma = max_luma - rnd(max_luma << 1); + } + const int dc = rnd(1 << bitdepth); + for (auto& r : intra_pred_mem_.ref_src) r = dc; + static constexpr int kSaturatedAlpha[] = {-16, 16}; + for (const int alpha : kSaturatedAlpha) { + intra_pred_mem_.Reset(&rnd); + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + base_cfl_intra_pred_(intra_pred_mem_.ref_src, stride, luma_buffer, alpha); + cur_cfl_intra_pred_(intra_pred_mem_.dst, stride, luma_buffer, alpha); + if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + block_width_, block_height_, kMaxBlockSize, + kMaxBlockSize, true)) { + ADD_FAILURE() << "Result from optimized version of CFL with alpha " + << alpha << " differs from reference."; + break; + } + } +} + +template <int bitdepth, typename Pixel, SubsamplingType subsampling_type> +class CflSubsamplerTest : public IntraPredTestBase<bitdepth, Pixel> { + public: + CflSubsamplerTest() = default; + CflSubsamplerTest(const CflSubsamplerTest&) = delete; + CflSubsamplerTest& operator=(const CflSubsamplerTest&) = delete; + ~CflSubsamplerTest() override = default; + + protected: + using IntraPredTestBase<bitdepth, Pixel>::tx_size_; + using IntraPredTestBase<bitdepth, Pixel>::block_width_; + using IntraPredTestBase<bitdepth, Pixel>::block_height_; + using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_; + + void SetUp() override { + IntraPredTestBase<bitdepth, Pixel>::SetUp(); + IntraPredCflInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_cfl_subsampler_ = dsp->cfl_subsamplers[tx_size_][subsampling_type]; + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if 
(absl::StartsWith(test_case, "C/")) { + base_cfl_subsampler_ = nullptr; + } else if (absl::StartsWith(test_case, "NEON/")) { + IntraPredCflInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + IntraPredCflInit_SSE4_1(); + } + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + cur_cfl_subsampler_ = dsp->cfl_subsamplers[tx_size_][subsampling_type]; + } + + // This test modifies intra_pred_mem_. + void TestSpeed(const char* digest, int num_runs); + void TestSaturatedValues(); + void TestRandomValues(); + + enum SubsamplingType SubsamplingType() const { return subsampling_type; } + + CflSubsamplerFunc base_cfl_subsampler_; + CflSubsamplerFunc cur_cfl_subsampler_; +}; + +// There is no case where both source and output have lowest height or width +// when that dimension is subsampled. +int GetLumaWidth(int block_width, SubsamplingType subsampling_type) { + if (block_width == 4) { + const int width_shift = + static_cast<int>(subsampling_type != kSubsamplingType444); + return block_width << width_shift; + } + return block_width; +} + +int GetLumaHeight(int block_height, SubsamplingType subsampling_type) { + if (block_height == 4) { + const int height_shift = + static_cast<int>(subsampling_type == kSubsamplingType420); + return block_height << height_shift; + } + return block_height; +} + +template <int bitdepth, typename Pixel, SubsamplingType subsampling_type> +void CflSubsamplerTest<bitdepth, Pixel, subsampling_type>::TestSpeed( + const char* const digest, const int num_runs) { + // C declines initializing the table in normal circumstances because there are + // assembly implementations. 
+ if (cur_cfl_subsampler_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + + const int width = GetLumaWidth(block_width_, subsampling_type); + const int height = GetLumaHeight(block_height_, subsampling_type); + Pixel* src = intra_pred_mem_.ref_src; + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + src[j] = rnd.RandRange(1 << bitdepth); + } + src += kMaxBlockSize; + } + const absl::Time start = absl::Now(); + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride] = {}; + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + for (int run = 0; run < num_runs; ++run) { + cur_cfl_subsampler_(luma, width, height, intra_pred_mem_.ref_src, stride); + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest(ToString(tx_size_), kCflIntraPredName, digest, + luma, sizeof(luma), elapsed_time); +} + +template <int bitdepth, typename Pixel, SubsamplingType subsampling_type> +void CflSubsamplerTest<bitdepth, Pixel, + subsampling_type>::TestSaturatedValues() { + if (base_cfl_subsampler_ == nullptr) return; + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + for (int width = GetLumaWidth(block_width_, subsampling_type); width > 0; + width -= 8) { + for (int height = GetLumaHeight(block_height_, subsampling_type); + height > 0; height -= 8) { + Pixel* src = intra_pred_mem_.ref_src; + for (int y = 0; y < height; ++y) { + Memset(src, (1 << bitdepth) - 1, width); + Memset(src + width, 0, kMaxBlockSize - width); + src += kMaxBlockSize; + } + Memset(intra_pred_mem_.ref_src + kMaxBlockSize * height, 0, + kMaxBlockSize * (kMaxBlockSize - height)); + + int16_t luma_base[kCflLumaBufferStride][kCflLumaBufferStride] = {}; + int16_t luma_cur[kCflLumaBufferStride][kCflLumaBufferStride] = {}; + base_cfl_subsampler_(luma_base, width, height, intra_pred_mem_.ref_src, + stride); + cur_cfl_subsampler_(luma_cur, width, height, intra_pred_mem_.ref_src, + stride); + if 
(!test_utils::CompareBlocks(reinterpret_cast<uint16_t*>(luma_cur[0]), + reinterpret_cast<uint16_t*>(luma_base[0]), + block_width_, block_height_, + kCflLumaBufferStride, kCflLumaBufferStride, + true)) { + FAIL() << "Result from optimized version of CFL subsampler" + << " differs from reference. max_luma_width: " << width + << " max_luma_height: " << height; + } + } + } +} + +template <int bitdepth, typename Pixel, SubsamplingType subsampling_type> +void CflSubsamplerTest<bitdepth, Pixel, subsampling_type>::TestRandomValues() { + if (base_cfl_subsampler_ == nullptr) return; + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + // Use an alternate seed to differentiate this test from TestSpeed(). + libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed); + for (int width = GetLumaWidth(block_width_, subsampling_type); width > 0; + width -= 8) { + for (int height = GetLumaHeight(block_height_, subsampling_type); + height > 0; height -= 8) { + Pixel* src = intra_pred_mem_.ref_src; + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + src[j] = rnd.RandRange(1 << bitdepth); + } + Memset(src + width, 0, kMaxBlockSize - width); + src += kMaxBlockSize; + } + Memset(intra_pred_mem_.ref_src + kMaxBlockSize * height, 0, + kMaxBlockSize * (kMaxBlockSize - height)); + + int16_t luma_base[kCflLumaBufferStride][kCflLumaBufferStride] = {}; + int16_t luma_cur[kCflLumaBufferStride][kCflLumaBufferStride] = {}; + base_cfl_subsampler_(luma_base, width, height, intra_pred_mem_.ref_src, + stride); + cur_cfl_subsampler_(luma_cur, width, height, intra_pred_mem_.ref_src, + stride); + if (!test_utils::CompareBlocks(reinterpret_cast<uint16_t*>(luma_cur[0]), + reinterpret_cast<uint16_t*>(luma_base[0]), + block_width_, block_height_, + kCflLumaBufferStride, kCflLumaBufferStride, + true)) { + FAIL() << "Result from optimized version of CFL subsampler" + << " differs from reference. 
max_luma_width: " << width + << " max_luma_height: " << height; + } + } + } +} + +//------------------------------------------------------------------------------ + +using CflIntraPredTest8bpp = CflIntraPredTest<8, uint8_t>; + +const char* GetCflIntraPredDigest8bpp(TransformSize tx_size) { + static const char* const kDigest4x4 = "9ea7088e082867fd5ae394ca549fe1ed"; + static const char* const kDigest4x8 = "323b0b4784b6658da781398e61f2da3d"; + static const char* const kDigest4x16 = "99eb9c65f227ca7f71dcac24645a4fec"; + static const char* const kDigest8x4 = "e8e782e31c94f3974b87b93d455262d8"; + static const char* const kDigest8x8 = "23ab9fb65e7bbbdb985709e115115eb5"; + static const char* const kDigest8x16 = "52f5add2fc4bbb2ff893148645e95b9c"; + static const char* const kDigest8x32 = "283fdee9af8afdb76f72dd7339c92c3c"; + static const char* const kDigest16x4 = "eead35f515b1aa8b5175b283192b86e6"; + static const char* const kDigest16x8 = "5778e934254eaab04230bc370f64f778"; + static const char* const kDigest16x16 = "4e8ed38ccba0d62f1213171da2212ed3"; + static const char* const kDigest16x32 = "61a29bd7699e18ca6ea5641d1d023bfd"; + static const char* const kDigest32x8 = "7f31607bd4f9ec879aa47f4daf9c7bb0"; + static const char* const kDigest32x16 = "eb84dfab900fa6a90e132b186b4c6c36"; + static const char* const kDigest32x32 = "e0ff35d407cb214578d61ef419c94237"; + + switch (tx_size) { + case kTransformSize4x4: + return kDigest4x4; + case kTransformSize4x8: + return kDigest4x8; + case kTransformSize4x16: + return kDigest4x16; + case kTransformSize8x4: + return kDigest8x4; + case kTransformSize8x8: + return kDigest8x8; + case kTransformSize8x16: + return kDigest8x16; + case kTransformSize8x32: + return kDigest8x32; + case kTransformSize16x4: + return kDigest16x4; + case kTransformSize16x8: + return kDigest16x8; + case kTransformSize16x16: + return kDigest16x16; + case kTransformSize16x32: + return kDigest16x32; + case kTransformSize32x8: + return kDigest32x8; + case 
kTransformSize32x16: + return kDigest32x16; + case kTransformSize32x32: + return kDigest32x32; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(CflIntraPredTest8bpp, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflIntraPredDigest8bpp(tx_size_), num_runs); +} + +TEST_P(CflIntraPredTest8bpp, FixedInput) { + TestSpeed(GetCflIntraPredDigest8bpp(tx_size_), 1); +} + +TEST_P(CflIntraPredTest8bpp, Overflow) { TestSaturatedValues(); } + +TEST_P(CflIntraPredTest8bpp, Random) { TestRandomValues(); } + +//------------------------------------------------------------------------------ + +using CflSubsamplerTest8bpp444 = + CflSubsamplerTest<8, uint8_t, kSubsamplingType444>; +using CflSubsamplerTest8bpp422 = + CflSubsamplerTest<8, uint8_t, kSubsamplingType422>; +using CflSubsamplerTest8bpp420 = + CflSubsamplerTest<8, uint8_t, kSubsamplingType420>; + +const char* GetCflSubsamplerDigest8bpp(TransformSize tx_size, + SubsamplingType subsampling_type) { + static const char* const kDigests4x4[3] = { + "a8fa98d76cc3ccffcffc0d02dfae052c", "929cf2c23d926b500616797f8b1baf5b", + "1d03f091956838e7f2b113aabd8b9da9"}; + static const char* const kDigests4x8[3] = { + "717b84f867f413c87c90a7c5d0125c8c", "6ccd9f48842b1a802e128b46b8f4885d", + "68a334f5d2abecbc78562b3280b5fb0c"}; + static const char* const kDigests4x16[3] = { + "ecd1340b7e065dd8807fd9861abb7d99", "042c3fee17df7ef8fb8cef616f212a91", + "b0600f0bc3fbfc374bb3628360dcae5c"}; + static const char* const kDigests8x4[3] = { + "4ea5617f4ed8e9edc2fff88d0ab8e53f", "b02288905f218c9f54ce4a472ec7b22e", + "3522d3a4dd3839d1a86fb39b31a86d52"}; + static const char* const kDigests8x8[3] = { + "a0488493e6bcdb868713a95f9b4a0091", "ff6c1ac1d94fce63c282ba49186529bf", + "082e34ba04d04d7cd6fe408823987602"}; + static const char* const kDigests8x16[3] = { + "e01dd4bb21daaa6e991cd5b1e6f30300", "2a1b13f932e39cc5f561afea9956f47a", + 
"d8d266282cb7123f780bd7266e8f5913"}; + static const char* const kDigests8x32[3] = { + "0fc95e4ab798b95ccd2966ff75028b03", "6bc6e45ef2f664134449342fe76006ff", + "d294fb6399edaa267aa167407c0ebccb"}; + static const char* const kDigests16x4[3] = { + "4798c2cf649b786bd153ad88353d52aa", "43a4bfa3b8caf4b72f58c6a1d1054f64", + "a928ebbec2db1508c8831a440d82eb98"}; + static const char* const kDigests16x8[3] = { + "736b7f5b603cb34abcbe1b7e69b6ce93", "90422000ab20ecb519e4d277a9b3ea2b", + "c8e71c2fddbb850c5a50592ee5975368"}; + static const char* const kDigests16x16[3] = { + "4f15a694966ee50a9e987e9a0aa2423b", "9e31e2f5a7ce7bef738b135755e25dcd", + "2ffeed4d592a0455f6d888913969827f"}; + static const char* const kDigests16x32[3] = { + "3a10438bfe17ea39efad20608a0520eb", "79e8e8732a6ffc29dfbb0b3fc29c2883", + "185ca976ccbef7fb5f3f8c6aa22d5a79"}; + static const char* const kDigests32x8[3] = { + "683704f08839a15e42603e4977a3e815", "13d311635372aee8998fca1758e75e20", + "9847d88eaaa57c086a2e6aed583048d3"}; + static const char* const kDigests32x16[3] = { + "14b6761bf9f1156cf2496f532512aa99", "ee57bb7f0aa2302d29cdc1bfce72d5fc", + "a4189655fe714b82eb88cb5092c0ad76"}; + static const char* const kDigests32x32[3] = { + "dcfbe71b70a37418ccb90dbf27f04226", "c578556a584019c1bdc2d0c3b9fd0c88", + "db200bc8ccbeacd6a42d6b8e5ad1d931"}; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4[subsampling_type]; + case kTransformSize4x8: + return kDigests4x8[subsampling_type]; + case kTransformSize4x16: + return kDigests4x16[subsampling_type]; + case kTransformSize8x4: + return kDigests8x4[subsampling_type]; + case kTransformSize8x8: + return kDigests8x8[subsampling_type]; + case kTransformSize8x16: + return kDigests8x16[subsampling_type]; + case kTransformSize8x32: + return kDigests8x32[subsampling_type]; + case kTransformSize16x4: + return kDigests16x4[subsampling_type]; + case kTransformSize16x8: + return kDigests16x8[subsampling_type]; + case kTransformSize16x16: + return 
kDigests16x16[subsampling_type]; + case kTransformSize16x32: + return kDigests16x32[subsampling_type]; + case kTransformSize32x8: + return kDigests32x8[subsampling_type]; + case kTransformSize32x16: + return kDigests32x16[subsampling_type]; + case kTransformSize32x32: + return kDigests32x32[subsampling_type]; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(CflSubsamplerTest8bpp444, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), num_runs); +} + +TEST_P(CflSubsamplerTest8bpp444, FixedInput) { + TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), 1); +} + +TEST_P(CflSubsamplerTest8bpp444, Overflow) { TestSaturatedValues(); } + +TEST_P(CflSubsamplerTest8bpp444, Random) { TestRandomValues(); } + +TEST_P(CflSubsamplerTest8bpp422, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), num_runs); +} + +TEST_P(CflSubsamplerTest8bpp422, FixedInput) { + TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), 1); +} + +TEST_P(CflSubsamplerTest8bpp422, Overflow) { TestSaturatedValues(); } + +TEST_P(CflSubsamplerTest8bpp422, Random) { TestRandomValues(); } + +TEST_P(CflSubsamplerTest8bpp420, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), num_runs); +} + +TEST_P(CflSubsamplerTest8bpp420, FixedInput) { + TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), 1); +} + +TEST_P(CflSubsamplerTest8bpp420, Overflow) { TestSaturatedValues(); } + +TEST_P(CflSubsamplerTest8bpp420, Random) { TestRandomValues(); } + +//------------------------------------------------------------------------------ + +#if LIBGAV1_MAX_BITDEPTH >= 10 
+//------------------------------------------------------------------------------ + +using CflIntraPredTest10bpp = CflIntraPredTest<10, uint16_t>; + +const char* GetCflIntraPredDigest10bpp(TransformSize tx_size) { + static const char* const kDigest4x4 = "b4ca5f6fbb643a94eb05d59976d44c5d"; + static const char* const kDigest4x8 = "040139b76ee22af05c56baf887d3d43b"; + static const char* const kDigest4x16 = "4a1d59ace84ff07e68a0d30e9b1cebdd"; + static const char* const kDigest8x4 = "c2c149cea5fdcd18bfe5c19ec2a8aa90"; + static const char* const kDigest8x8 = "68ad90bd6f409548fa5551496b7cb0d0"; + static const char* const kDigest8x16 = "bdc54eff4de8c5d597b03afaa705d3fe"; + static const char* const kDigest8x32 = "362aebc6d68ff0d312d55dcd6a8a927d"; + static const char* const kDigest16x4 = "349e813aedd211581c5e64ba1938eaa7"; + static const char* const kDigest16x8 = "35c64f6da17f836618b5804185cf3eef"; + static const char* const kDigest16x16 = "95be0c78dbd8dda793c62c6635b4bfb7"; + static const char* const kDigest16x32 = "4752b9eda069854d3f5c56d3f2057e79"; + static const char* const kDigest32x8 = "dafc5e973e4b6a55861f4586a11b7dd1"; + static const char* const kDigest32x16 = "1e177ed3914a165183916aca1d01bb74"; + static const char* const kDigest32x32 = "4c9ab3cf9baa27bb34e29729dabc1ea6"; + + switch (tx_size) { + case kTransformSize4x4: + return kDigest4x4; + case kTransformSize4x8: + return kDigest4x8; + case kTransformSize4x16: + return kDigest4x16; + case kTransformSize8x4: + return kDigest8x4; + case kTransformSize8x8: + return kDigest8x8; + case kTransformSize8x16: + return kDigest8x16; + case kTransformSize8x32: + return kDigest8x32; + case kTransformSize16x4: + return kDigest16x4; + case kTransformSize16x8: + return kDigest16x8; + case kTransformSize16x16: + return kDigest16x16; + case kTransformSize16x32: + return kDigest16x32; + case kTransformSize32x8: + return kDigest32x8; + case kTransformSize32x16: + return kDigest32x16; + case kTransformSize32x32: + return 
kDigest32x32; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(CflIntraPredTest10bpp, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflIntraPredDigest10bpp(tx_size_), num_runs); +} + +TEST_P(CflIntraPredTest10bpp, FixedInput) { + TestSpeed(GetCflIntraPredDigest10bpp(tx_size_), 1); +} + +TEST_P(CflIntraPredTest10bpp, Overflow) { TestSaturatedValues(); } + +TEST_P(CflIntraPredTest10bpp, Random) { TestRandomValues(); } + +//------------------------------------------------------------------------------ + +using CflSubsamplerTest10bpp444 = + CflSubsamplerTest<10, uint16_t, kSubsamplingType444>; +using CflSubsamplerTest10bpp422 = + CflSubsamplerTest<10, uint16_t, kSubsamplingType422>; +using CflSubsamplerTest10bpp420 = + CflSubsamplerTest<10, uint16_t, kSubsamplingType420>; + +const char* GetCflSubsamplerDigest10bpp(TransformSize tx_size, + SubsamplingType subsampling_type) { + static const char* const kDigests4x4[3] = { + "a8abcad9a6c9b046a100689135a108cb", "01081c2a0d0c15dabdbc725be5660451", + "93d1d9df2861240d88f5618e42178654"}; + static const char* const kDigests4x8[3] = { + "d1fd8cd0709ca6634ad85f3e331672e1", "0d603fcc910aca3db41fc7f64e826c27", + "cf88b6d1b7b025cfa0082361775aeb75"}; + static const char* const kDigests4x16[3] = { + "ce2e036a950388a564d8637b1416a6c6", "6c36c46cd72057a6b36bc12188b6d22c", + "0884a0e53384cd5173035ad8966d8f2f"}; + static const char* const kDigests8x4[3] = { + "174e961983ed71fb105ed71aa3f9daf5", "330946cc369a534618a1014b4e3f6f18", + "8070668aa389c1d09f8aaf43c1223e8c"}; + static const char* const kDigests8x8[3] = { + "86884feb35217010f73ccdbadecb635e", "b8cbc646e1bf1352e5b4b599eaef1193", + "4a1110382e56b42d3b7a4132bccc01ee"}; + static const char* const kDigests8x16[3] = { + "a694c4e1f89648ffb49efd6a1d35b300", "864b9da67d23a2f8284b28b2a1e5aa30", + "bd012ca1cea256dd02c231339a4cf200"}; + static const char* const 
kDigests8x32[3] = { + "60c42201bc24e518c1a3b3b6306d8125", "4d530e47c2b7555d5f311ee910d61842", + "71888b17b832ef55c0cd9449c0e6b077"}; + static const char* const kDigests16x4[3] = { + "6b6d5ae4cc294c070ce65ab31c5a7d4f", "0fbecee20d294939e7a0183c2b4a0b96", + "917cd884923139d5c05a11000722e3b6"}; + static const char* const kDigests16x8[3] = { + "688c41726d9ac35fb5b18c57bca76b9c", "d439a2e0a60d672b644cd1189e2858b9", + "edded6d166a77a6c3ff46fddc13f372f"}; + static const char* const kDigests16x16[3] = { + "feb2bad9f6bb3f60eaeaf6c1bfd89ca5", "d65cabce5fcd9a29d1dfc530e4764f3a", + "2f1a91898812d2c9320c7506b3a72eb4"}; + static const char* const kDigests16x32[3] = { + "6f23b1851444d29633e62ce77bf09559", "4a449fd078bd0c9657cdc24b709c0796", + "e44e18cb8bda2d34b52c96d5b6b510be"}; + static const char* const kDigests32x8[3] = { + "77bf9ba56f7e1d2f04068a8a00b139da", "a85a1dea82963dedab9a2f7ad4169b5f", + "d12746071bee96ddc075c6368bc9fbaf"}; + static const char* const kDigests32x16[3] = { + "cce3422f7f8cf57145f979359ac92f98", "1c18738d40bfa91296e5fdb7230bf9a7", + "02513142d109aee10f081cacfb33d1c5"}; + static const char* const kDigests32x32[3] = { + "789008e49d0276de186af968196dd4a7", "b8848b00968a7ba4787765b7214da05f", + "12d13828db57605b00ce99469489651d"}; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4[subsampling_type]; + case kTransformSize4x8: + return kDigests4x8[subsampling_type]; + case kTransformSize4x16: + return kDigests4x16[subsampling_type]; + case kTransformSize8x4: + return kDigests8x4[subsampling_type]; + case kTransformSize8x8: + return kDigests8x8[subsampling_type]; + case kTransformSize8x16: + return kDigests8x16[subsampling_type]; + case kTransformSize8x32: + return kDigests8x32[subsampling_type]; + case kTransformSize16x4: + return kDigests16x4[subsampling_type]; + case kTransformSize16x8: + return kDigests16x8[subsampling_type]; + case kTransformSize16x16: + return kDigests16x16[subsampling_type]; + case kTransformSize16x32: + return 
kDigests16x32[subsampling_type]; + case kTransformSize32x8: + return kDigests32x8[subsampling_type]; + case kTransformSize32x16: + return kDigests32x16[subsampling_type]; + case kTransformSize32x32: + return kDigests32x32[subsampling_type]; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(CflSubsamplerTest10bpp444, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), num_runs); +} + +TEST_P(CflSubsamplerTest10bpp444, FixedInput) { + TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), 1); +} + +TEST_P(CflSubsamplerTest10bpp444, Overflow) { TestSaturatedValues(); } + +TEST_P(CflSubsamplerTest10bpp444, Random) { TestRandomValues(); } + +TEST_P(CflSubsamplerTest10bpp422, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), num_runs); +} + +TEST_P(CflSubsamplerTest10bpp422, FixedInput) { + TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), 1); +} + +TEST_P(CflSubsamplerTest10bpp422, Overflow) { TestSaturatedValues(); } + +TEST_P(CflSubsamplerTest10bpp422, Random) { TestRandomValues(); } + +TEST_P(CflSubsamplerTest10bpp420, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), num_runs); +} + +TEST_P(CflSubsamplerTest10bpp420, FixedInput) { + TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), 1); +} + +TEST_P(CflSubsamplerTest10bpp420, Overflow) { TestSaturatedValues(); } + +TEST_P(CflSubsamplerTest10bpp420, Random) { TestRandomValues(); } + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +// Cfl predictors are available only for transform sizes with +// max(width, height) <= 32. 
+constexpr TransformSize kTransformSizesSmallerThan32x32[] = { + kTransformSize4x4, kTransformSize4x8, kTransformSize4x16, + kTransformSize8x4, kTransformSize8x8, kTransformSize8x16, + kTransformSize8x32, kTransformSize16x4, kTransformSize16x8, + kTransformSize16x16, kTransformSize16x32, kTransformSize32x8, + kTransformSize32x16, kTransformSize32x32}; + +INSTANTIATE_TEST_SUITE_P(C, CflIntraPredTest8bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest8bpp444, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest8bpp422, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest8bpp420, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, CflIntraPredTest8bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest8bpp444, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest8bpp420, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#endif // LIBGAV1_ENABLE_SSE4_1 +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, CflIntraPredTest8bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest8bpp444, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest8bpp420, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#endif // LIBGAV1_ENABLE_NEON + +#if LIBGAV1_MAX_BITDEPTH >= 10 +INSTANTIATE_TEST_SUITE_P(C, CflIntraPredTest10bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest10bpp444, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest10bpp422, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest10bpp420, 
+ testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, CflIntraPredTest10bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest10bpp444, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest10bpp420, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#endif // LIBGAV1_ENABLE_SSE4_1 +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, CflIntraPredTest10bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest10bpp444, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest10bpp420, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp + +static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) { + return os << ToString(tx_size); +} + +} // namespace libgav1 diff --git a/src/dsp/intrapred_directional.cc b/src/dsp/intrapred_directional.cc new file mode 100644 index 0000000..e670769 --- /dev/null +++ b/src/dsp/intrapred_directional.cc @@ -0,0 +1,252 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/intrapred_directional.h" + +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/memory.h" + +namespace libgav1 { +namespace dsp { +namespace { + +//------------------------------------------------------------------------------ +// 7.11.2.4. Directional intra prediction process + +template <typename Pixel> +void DirectionalIntraPredictorZone1_C(void* const dest, ptrdiff_t stride, + const void* const top_row, + const int width, const int height, + const int xstep, + const bool upsampled_top) { + const auto* const top = static_cast<const Pixel*>(top_row); + auto* dst = static_cast<Pixel*>(dest); + stride /= sizeof(Pixel); + + assert(xstep > 0); + + // If xstep == 64 then |shift| always evaluates to 0 which sets |val| to + // |top[top_base_x]|. This corresponds to a 45 degree prediction. + if (xstep == 64) { + // 7.11.2.10. Intra edge upsample selection process + // if ( d <= 0 || d >= 40 ) useUpsample = 0 + // For |upsampled_top| the delta is |predictor_angle - 90|. Since the + // |predictor_angle| is 45 the delta is also 45. 
+ assert(!upsampled_top); + const Pixel* top_ptr = top + 1; + for (int y = 0; y < height; ++y, dst += stride, ++top_ptr) { + memcpy(dst, top_ptr, sizeof(*top_ptr) * width); + } + return; + } + + const int upsample_shift = static_cast<int>(upsampled_top); + const int max_base_x = ((width + height) - 1) << upsample_shift; + const int scale_bits = 6 - upsample_shift; + const int base_step = 1 << upsample_shift; + int top_x = xstep; + int y = 0; + do { + int top_base_x = top_x >> scale_bits; + + if (top_base_x >= max_base_x) { + for (int i = y; i < height; ++i) { + Memset(dst, top[max_base_x], width); + dst += stride; + } + return; + } + + const int shift = ((top_x << upsample_shift) & 0x3F) >> 1; + int x = 0; + do { + if (top_base_x >= max_base_x) { + Memset(dst + x, top[max_base_x], width - x); + break; + } + + const int val = + top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift; + dst[x] = RightShiftWithRounding(val, 5 /*log2(32)*/); + top_base_x += base_step; + } while (++x < width); + + dst += stride; + top_x += xstep; + } while (++y < height); +} + +template <typename Pixel> +void DirectionalIntraPredictorZone2_C(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const left_column, + const int width, const int height, + const int xstep, const int ystep, + const bool upsampled_top, + const bool upsampled_left) { + const auto* const top = static_cast<const Pixel*>(top_row); + const auto* const left = static_cast<const Pixel*>(left_column); + auto* dst = static_cast<Pixel*>(dest); + stride /= sizeof(Pixel); + + assert(xstep > 0); + assert(ystep > 0); + + const int upsample_top_shift = static_cast<int>(upsampled_top); + const int upsample_left_shift = static_cast<int>(upsampled_left); + const int scale_bits_x = 6 - upsample_top_shift; + const int scale_bits_y = 6 - upsample_left_shift; + const int min_base_x = -(1 << upsample_top_shift); + const int base_step_x = 1 << upsample_top_shift; + int y = 0; + int top_x = -xstep; + do { 
+ int top_base_x = top_x >> scale_bits_x; + int left_y = (y << 6) - ystep; + int x = 0; + do { + int val; + if (top_base_x >= min_base_x) { + const int shift = ((top_x * (1 << upsample_top_shift)) & 0x3F) >> 1; + val = top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift; + } else { + // Note this assumes an arithmetic shift to handle negative values. + const int left_base_y = left_y >> scale_bits_y; + const int shift = ((left_y * (1 << upsample_left_shift)) & 0x3F) >> 1; + assert(left_base_y >= -(1 << upsample_left_shift)); + val = left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift; + } + dst[x] = RightShiftWithRounding(val, 5); + top_base_x += base_step_x; + left_y -= ystep; + } while (++x < width); + + top_x -= xstep; + dst += stride; + } while (++y < height); +} + +template <typename Pixel> +void DirectionalIntraPredictorZone3_C(void* const dest, ptrdiff_t stride, + const void* const left_column, + const int width, const int height, + const int ystep, + const bool upsampled_left) { + const auto* const left = static_cast<const Pixel*>(left_column); + stride /= sizeof(Pixel); + + assert(ystep > 0); + + const int upsample_shift = static_cast<int>(upsampled_left); + const int scale_bits = 6 - upsample_shift; + const int base_step = 1 << upsample_shift; + // Zone3 never runs out of left_column values. 
+ assert((width + height - 1) << upsample_shift > // max_base_y + ((ystep * width) >> scale_bits) + + base_step * (height - 1)); // left_base_y + + int left_y = ystep; + int x = 0; + do { + auto* dst = static_cast<Pixel*>(dest); + + int left_base_y = left_y >> scale_bits; + int y = 0; + do { + const int shift = ((left_y << upsample_shift) & 0x3F) >> 1; + const int val = + left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift; + dst[x] = RightShiftWithRounding(val, 5); + dst += stride; + left_base_y += base_step; + } while (++y < height); + + left_y += ystep; + } while (++x < width); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->directional_intra_predictor_zone1 = + DirectionalIntraPredictorZone1_C<uint8_t>; + dsp->directional_intra_predictor_zone2 = + DirectionalIntraPredictorZone2_C<uint8_t>; + dsp->directional_intra_predictor_zone3 = + DirectionalIntraPredictorZone3_C<uint8_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 + dsp->directional_intra_predictor_zone1 = + DirectionalIntraPredictorZone1_C<uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 + dsp->directional_intra_predictor_zone2 = + DirectionalIntraPredictorZone2_C<uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 + dsp->directional_intra_predictor_zone3 = + DirectionalIntraPredictorZone3_C<uint8_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->directional_intra_predictor_zone1 = + DirectionalIntraPredictorZone1_C<uint16_t>; + dsp->directional_intra_predictor_zone2 = + DirectionalIntraPredictorZone2_C<uint16_t>; + dsp->directional_intra_predictor_zone3 = + 
DirectionalIntraPredictorZone3_C<uint16_t>; +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 + dsp->directional_intra_predictor_zone1 = + DirectionalIntraPredictorZone1_C<uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2 + dsp->directional_intra_predictor_zone2 = + DirectionalIntraPredictorZone2_C<uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 + dsp->directional_intra_predictor_zone3 = + DirectionalIntraPredictorZone3_C<uint16_t>; +#endif +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace + +void IntraPredDirectionalInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/intrapred_directional.h b/src/dsp/intrapred_directional.h new file mode 100644 index 0000000..bcd1bc1 --- /dev/null +++ b/src/dsp/intrapred_directional.h @@ -0,0 +1,48 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_ +#define LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. 
+// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/intrapred_directional_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/intrapred_directional_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::directional_intra_predictor_zone*. This function is not +// thread-safe. +void IntraPredDirectionalInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_ diff --git a/src/dsp/intrapred_directional_test.cc b/src/dsp/intrapred_directional_test.cc new file mode 100644 index 0000000..ebf9da0 --- /dev/null +++ b/src/dsp/intrapred_directional_test.cc @@ -0,0 +1,929 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/intrapred_directional.h" + +#include <cmath> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <memory> +#include <ostream> + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kMaxBlockSize = 64; +constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize; +constexpr int kNumDirectionalIntraPredictors = 3; + +constexpr int kBaseAngles[] = {45, 67, 90, 113, 135, 157, 180, 203}; + +const char* const kDirectionalPredNames[kNumDirectionalIntraPredictors] = { + "kDirectionalIntraPredictorZone1", "kDirectionalIntraPredictorZone2", + "kDirectionalIntraPredictorZone3"}; + +int16_t GetDirectionalIntraPredictorDerivative(const int angle) { + EXPECT_GE(angle, 3); + EXPECT_LE(angle, 87); + return kDirectionalIntraPredictorDerivative[DivideBy2(angle) - 1]; +} + +template <int bitdepth, typename Pixel> +class IntraPredTestBase : public testing::TestWithParam<TransformSize>, + public test_utils::MaxAlignedAllocable { + public: + IntraPredTestBase() { + switch (tx_size_) { + case kNumTransformSizes: + EXPECT_NE(tx_size_, kNumTransformSizes); + break; + default: + block_width_ = kTransformWidth[tx_size_]; + block_height_ = kTransformHeight[tx_size_]; + break; + } + } + + IntraPredTestBase(const IntraPredTestBase&) = delete; + IntraPredTestBase& operator=(const IntraPredTestBase&) = delete; + ~IntraPredTestBase() override = default; + + protected: + struct IntraPredMem { + void Reset(libvpx_test::ACMRandom* rnd) { + ASSERT_NE(rnd, nullptr); + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + const int 
mask = (1 << bitdepth) - 1; + for (auto& r : ref_src) r = rnd->Rand16() & mask; + for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask; + for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask; + + // Some directional predictors require top-right, bottom-left. + for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) { + left[i] = rnd->Rand16() & mask; + top[i] = rnd->Rand16() & mask; + } + // TODO(jzern): reorder this and regenerate the digests after switching + // random number generators. + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + left[-1] = rnd->Rand16() & mask; + left[-2] = rnd->Rand16() & mask; + top[-2] = rnd->Rand16() & mask; + memset(left_mem, 0, sizeof(left_mem[0]) * 14); + memset(top_mem, 0, sizeof(top_mem[0]) * 14); + memset(top_mem + kMaxBlockSize * 2 + 16, 0, + sizeof(top_mem[0]) * kTopMemPadding); + } + + // Set ref_src, top-left, top and left to |pixel|. + void Set(const Pixel pixel) { + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + for (auto& r : ref_src) r = pixel; + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + for (int i = -2; i < 2 * kMaxBlockSize; ++i) { + left[i] = top[i] = pixel; + } + } + + // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|. 
+ static constexpr int kTopMemPadding = 7; + alignas(kMaxAlignment) Pixel dst[kTotalPixels]; + alignas(kMaxAlignment) Pixel ref_src[kTotalPixels]; + alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16]; + alignas( + kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding]; + }; + + void SetUp() override { test_utils::ResetDspTable(bitdepth); } + + const TransformSize tx_size_ = GetParam(); + int block_width_; + int block_height_; + IntraPredMem intra_pred_mem_; +}; + +//------------------------------------------------------------------------------ +// DirectionalIntraPredTest + +template <int bitdepth, typename Pixel> +class DirectionalIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> { + public: + DirectionalIntraPredTest() = default; + DirectionalIntraPredTest(const DirectionalIntraPredTest&) = delete; + DirectionalIntraPredTest& operator=(const DirectionalIntraPredTest&) = delete; + ~DirectionalIntraPredTest() override = default; + + protected: + using IntraPredTestBase<bitdepth, Pixel>::tx_size_; + using IntraPredTestBase<bitdepth, Pixel>::block_width_; + using IntraPredTestBase<bitdepth, Pixel>::block_height_; + using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_; + + enum Zone { kZone1, kZone2, kZone3, kNumZones }; + + enum { kAngleDeltaStart = -9, kAngleDeltaStop = 9, kAngleDeltaStep = 3 }; + + void SetUp() override { + IntraPredTestBase<bitdepth, Pixel>::SetUp(); + IntraPredDirectionalInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_directional_intra_pred_zone1_ = dsp->directional_intra_predictor_zone1; + base_directional_intra_pred_zone2_ = dsp->directional_intra_predictor_zone2; + base_directional_intra_pred_zone3_ = dsp->directional_intra_predictor_zone3; + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + 
base_directional_intra_pred_zone1_ = nullptr; + base_directional_intra_pred_zone2_ = nullptr; + base_directional_intra_pred_zone3_ = nullptr; + } else if (absl::StartsWith(test_case, "NEON/")) { + IntraPredDirectionalInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + IntraPredDirectionalInit_SSE4_1(); + } + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + + cur_directional_intra_pred_zone1_ = dsp->directional_intra_predictor_zone1; + cur_directional_intra_pred_zone2_ = dsp->directional_intra_predictor_zone2; + cur_directional_intra_pred_zone3_ = dsp->directional_intra_predictor_zone3; + + // Skip functions that haven't been specialized for this particular + // architecture. + if (cur_directional_intra_pred_zone1_ == + base_directional_intra_pred_zone1_) { + cur_directional_intra_pred_zone1_ = nullptr; + } + if (cur_directional_intra_pred_zone2_ == + base_directional_intra_pred_zone2_) { + cur_directional_intra_pred_zone2_ = nullptr; + } + if (cur_directional_intra_pred_zone3_ == + base_directional_intra_pred_zone3_) { + cur_directional_intra_pred_zone3_ = nullptr; + } + } + + bool IsEdgeUpsampled(int delta, const int filter_type) const { + delta = std::abs(delta); + if (delta == 0 || delta >= 40) return false; + const int block_wh = block_width_ + block_height_; + return (filter_type == 1) ? block_wh <= 8 : block_wh <= 16; + } + + // Returns the minimum and maximum (exclusive) range of angles that the + // predictor should be applied to. 
+ void GetZoneAngleRange(const Zone zone, int* const min_angle, + int* const max_angle) const { + ASSERT_NE(min_angle, nullptr); + ASSERT_NE(max_angle, nullptr); + switch (zone) { + // The overall minimum angle comes from mode D45_PRED, yielding: + // min_angle = 45-(MAX_ANGLE_DELTA*ANGLE_STEP) = 36 + // The overall maximum angle comes from mode D203_PRED, yielding: + // max_angle = 203+(MAX_ANGLE_DELTA*ANGLE_STEP) = 212 + // The angles 180 and 90 are not permitted because they correspond to + // V_PRED and H_PRED, which are handled in distinct functions. + case kZone1: + *min_angle = 36; + *max_angle = 87; + break; + case kZone2: + *min_angle = 93; + *max_angle = 177; + break; + case kZone3: + *min_angle = 183; + *max_angle = 212; + break; + case kNumZones: + FAIL() << "Invalid zone value: " << zone; + break; + } + } + + // These tests modify intra_pred_mem_. + void TestSpeed(const char* const digests[kNumDirectionalIntraPredictors], + Zone zone, int num_runs); + void TestSaturatedValues(); + void TestRandomValues(); + + DirectionalIntraPredictorZone1Func base_directional_intra_pred_zone1_; + DirectionalIntraPredictorZone2Func base_directional_intra_pred_zone2_; + DirectionalIntraPredictorZone3Func base_directional_intra_pred_zone3_; + DirectionalIntraPredictorZone1Func cur_directional_intra_pred_zone1_; + DirectionalIntraPredictorZone2Func cur_directional_intra_pred_zone2_; + DirectionalIntraPredictorZone3Func cur_directional_intra_pred_zone3_; +}; + +template <int bitdepth, typename Pixel> +void DirectionalIntraPredTest<bitdepth, Pixel>::TestSpeed( + const char* const digests[kNumDirectionalIntraPredictors], const Zone zone, + const int num_runs) { + switch (zone) { + case kZone1: + if (cur_directional_intra_pred_zone1_ == nullptr) return; + break; + case kZone2: + if (cur_directional_intra_pred_zone2_ == nullptr) return; + break; + case kZone3: + if (cur_directional_intra_pred_zone3_ == nullptr) return; + break; + case kNumZones: + FAIL() << "Invalid zone 
value: " << zone; + break; + } + ASSERT_NE(digests, nullptr); + const Pixel* const left = intra_pred_mem_.left_mem + 16; + const Pixel* const top = intra_pred_mem_.top_mem + 16; + + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + intra_pred_mem_.Reset(&rnd); + + // Allocate separate blocks for each angle + filter + upsampled combination. + // Add a 1 pixel right border to test for overwrites. + static constexpr int kMaxZoneAngles = 27; // zone 2 + static constexpr int kMaxFilterTypes = 2; + static constexpr int kBlockBorder = 1; + static constexpr int kBorderSize = + kBlockBorder * kMaxZoneAngles * kMaxFilterTypes; + const int ref_stride = + kMaxZoneAngles * kMaxFilterTypes * block_width_ + kBorderSize; + const size_t ref_alloc_size = sizeof(Pixel) * ref_stride * block_height_; + + using AlignedPtr = std::unique_ptr<Pixel[], decltype(&AlignedFree)>; + AlignedPtr ref_src(static_cast<Pixel*>(AlignedAlloc(16, ref_alloc_size)), + &AlignedFree); + AlignedPtr dest(static_cast<Pixel*>(AlignedAlloc(16, ref_alloc_size)), + &AlignedFree); + ASSERT_NE(ref_src, nullptr); + ASSERT_NE(dest, nullptr); + + const int mask = (1 << bitdepth) - 1; + for (size_t i = 0; i < ref_alloc_size / sizeof(ref_src[0]); ++i) { + ref_src[i] = rnd.Rand16() & mask; + } + + int min_angle = 0, max_angle = 0; + ASSERT_NO_FATAL_FAILURE(GetZoneAngleRange(zone, &min_angle, &max_angle)); + + absl::Duration elapsed_time; + for (int run = 0; run < num_runs; ++run) { + Pixel* dst = dest.get(); + memcpy(dst, ref_src.get(), ref_alloc_size); + for (const auto& base_angle : kBaseAngles) { + for (int filter_type = 0; filter_type <= 1; ++filter_type) { + for (int angle_delta = kAngleDeltaStart; angle_delta <= kAngleDeltaStop; + angle_delta += kAngleDeltaStep) { + const int predictor_angle = base_angle + angle_delta; + if (predictor_angle < min_angle || predictor_angle > max_angle) { + continue; + } + + ASSERT_GT(predictor_angle, 0) << "base_angle: " << base_angle + << " angle_delta: " << 
angle_delta; + const bool upsampled_left = + IsEdgeUpsampled(predictor_angle - 180, filter_type); + const bool upsampled_top = + IsEdgeUpsampled(predictor_angle - 90, filter_type); + const ptrdiff_t stride = ref_stride * sizeof(ref_src[0]); + if (predictor_angle < 90) { + ASSERT_EQ(zone, kZone1); + const int xstep = + GetDirectionalIntraPredictorDerivative(predictor_angle); + const absl::Time start = absl::Now(); + cur_directional_intra_pred_zone1_(dst, stride, top, block_width_, + block_height_, xstep, + upsampled_top); + elapsed_time += absl::Now() - start; + } else if (predictor_angle < 180) { + ASSERT_EQ(zone, kZone2); + const int xstep = + GetDirectionalIntraPredictorDerivative(180 - predictor_angle); + const int ystep = + GetDirectionalIntraPredictorDerivative(predictor_angle - 90); + const absl::Time start = absl::Now(); + cur_directional_intra_pred_zone2_( + dst, stride, top, left, block_width_, block_height_, xstep, + ystep, upsampled_top, upsampled_left); + elapsed_time += absl::Now() - start; + } else { + ASSERT_EQ(zone, kZone3); + ASSERT_LT(predictor_angle, 270); + const int ystep = + GetDirectionalIntraPredictorDerivative(270 - predictor_angle); + const absl::Time start = absl::Now(); + cur_directional_intra_pred_zone3_(dst, stride, left, block_width_, + block_height_, ystep, + upsampled_left); + elapsed_time += absl::Now() - start; + } + dst += block_width_ + kBlockBorder; + } + } + } + } + + test_utils::CheckMd5Digest(ToString(tx_size_), kDirectionalPredNames[zone], + digests[zone], dest.get(), ref_alloc_size, + elapsed_time); +} + +template <int bitdepth, typename Pixel> +void DirectionalIntraPredTest<bitdepth, Pixel>::TestSaturatedValues() { + const Pixel* const left = intra_pred_mem_.left_mem + 16; + const Pixel* const top = intra_pred_mem_.top_mem + 16; + const auto kMaxPixel = static_cast<Pixel>((1 << bitdepth) - 1); + intra_pred_mem_.Set(kMaxPixel); + + for (int i = kZone1; i < kNumZones; ++i) { + switch (i) { + case kZone1: + if 
(cur_directional_intra_pred_zone1_ == nullptr) continue; + break; + case kZone2: + if (cur_directional_intra_pred_zone2_ == nullptr) continue; + break; + case kZone3: + if (cur_directional_intra_pred_zone3_ == nullptr) continue; + break; + case kNumZones: + FAIL() << "Invalid zone value: " << i; + break; + } + int min_angle = 0, max_angle = 0; + ASSERT_NO_FATAL_FAILURE( + GetZoneAngleRange(static_cast<Zone>(i), &min_angle, &max_angle)); + + for (const auto& base_angle : kBaseAngles) { + for (int filter_type = 0; filter_type <= 1; ++filter_type) { + for (int angle_delta = kAngleDeltaStart; angle_delta <= kAngleDeltaStop; + angle_delta += kAngleDeltaStep) { + const int predictor_angle = base_angle + angle_delta; + if (predictor_angle <= min_angle || predictor_angle >= max_angle) { + continue; + } + ASSERT_GT(predictor_angle, 0) << "base_angle: " << base_angle + << " angle_delta: " << angle_delta; + + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + + const bool upsampled_left = + IsEdgeUpsampled(predictor_angle - 180, filter_type); + const bool upsampled_top = + IsEdgeUpsampled(predictor_angle - 90, filter_type); + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + if (predictor_angle < 90) { + const int xstep = + GetDirectionalIntraPredictorDerivative(predictor_angle); + cur_directional_intra_pred_zone1_(intra_pred_mem_.dst, stride, top, + block_width_, block_height_, + xstep, upsampled_top); + } else if (predictor_angle < 180) { + const int xstep = + GetDirectionalIntraPredictorDerivative(180 - predictor_angle); + const int ystep = + GetDirectionalIntraPredictorDerivative(predictor_angle - 90); + cur_directional_intra_pred_zone2_( + intra_pred_mem_.dst, stride, top, left, block_width_, + block_height_, xstep, ystep, upsampled_top, upsampled_left); + } else { + ASSERT_LT(predictor_angle, 270); + const int ystep = + GetDirectionalIntraPredictorDerivative(270 - predictor_angle); + 
cur_directional_intra_pred_zone3_(intra_pred_mem_.dst, stride, left, + block_width_, block_height_, + ystep, upsampled_left); + } + + if (!test_utils::CompareBlocks( + intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_, + block_height_, kMaxBlockSize, kMaxBlockSize, true)) { + ADD_FAILURE() << "Expected " << kDirectionalPredNames[i] + << " (angle: " << predictor_angle + << " filter type: " << filter_type + << ") to produce a block containing '" + << static_cast<int>(kMaxPixel) << "'"; + return; + } + } + } + } + } +} + +template <int bitdepth, typename Pixel> +void DirectionalIntraPredTest<bitdepth, Pixel>::TestRandomValues() { + const Pixel* const left = intra_pred_mem_.left_mem + 16; + const Pixel* const top = intra_pred_mem_.top_mem + 16; + // Use an alternate seed to differentiate this test from TestSpeed(). + libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed); + + for (int i = kZone1; i < kNumZones; ++i) { + // Only run when there is a reference version (base) and a different + // optimized version (cur). 
+ switch (i) { + case kZone1: + if (base_directional_intra_pred_zone1_ == nullptr || + cur_directional_intra_pred_zone1_ == nullptr) { + continue; + } + break; + case kZone2: + if (base_directional_intra_pred_zone2_ == nullptr || + cur_directional_intra_pred_zone2_ == nullptr) { + continue; + } + break; + case kZone3: + if (base_directional_intra_pred_zone3_ == nullptr || + cur_directional_intra_pred_zone3_ == nullptr) { + continue; + } + break; + case kNumZones: + FAIL() << "Invalid zone value: " << i; + break; + } + int min_angle = 0, max_angle = 0; + ASSERT_NO_FATAL_FAILURE( + GetZoneAngleRange(static_cast<Zone>(i), &min_angle, &max_angle)); + + for (const auto& base_angle : kBaseAngles) { + for (int n = 0; n < 1000; ++n) { + for (int filter_type = 0; filter_type <= 1; ++filter_type) { + for (int angle_delta = kAngleDeltaStart; + angle_delta <= kAngleDeltaStop; angle_delta += kAngleDeltaStep) { + const int predictor_angle = base_angle + angle_delta; + if (predictor_angle <= min_angle || predictor_angle >= max_angle) { + continue; + } + ASSERT_GT(predictor_angle, 0) << "base_angle: " << base_angle + << " angle_delta: " << angle_delta; + + intra_pred_mem_.Reset(&rnd); + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + + const bool upsampled_left = + IsEdgeUpsampled(predictor_angle - 180, filter_type); + const bool upsampled_top = + IsEdgeUpsampled(predictor_angle - 90, filter_type); + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + if (predictor_angle < 90) { + const int xstep = + GetDirectionalIntraPredictorDerivative(predictor_angle); + base_directional_intra_pred_zone1_( + intra_pred_mem_.ref_src, stride, top, block_width_, + block_height_, xstep, upsampled_top); + cur_directional_intra_pred_zone1_( + intra_pred_mem_.dst, stride, top, block_width_, block_height_, + xstep, upsampled_top); + } else if (predictor_angle < 180) { + const int xstep = + GetDirectionalIntraPredictorDerivative(180 - predictor_angle); + 
const int ystep = + GetDirectionalIntraPredictorDerivative(predictor_angle - 90); + base_directional_intra_pred_zone2_( + intra_pred_mem_.ref_src, stride, top, left, block_width_, + block_height_, xstep, ystep, upsampled_top, upsampled_left); + cur_directional_intra_pred_zone2_( + intra_pred_mem_.dst, stride, top, left, block_width_, + block_height_, xstep, ystep, upsampled_top, upsampled_left); + } else { + ASSERT_LT(predictor_angle, 270); + const int ystep = + GetDirectionalIntraPredictorDerivative(270 - predictor_angle); + base_directional_intra_pred_zone3_( + intra_pred_mem_.ref_src, stride, left, block_width_, + block_height_, ystep, upsampled_left); + cur_directional_intra_pred_zone3_( + intra_pred_mem_.dst, stride, left, block_width_, + block_height_, ystep, upsampled_left); + } + + if (!test_utils::CompareBlocks( + intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_, + block_height_, kMaxBlockSize, kMaxBlockSize, true)) { + ADD_FAILURE() << "Result from optimized version of " + << kDirectionalPredNames[i] + << " differs from reference at angle " + << predictor_angle << " with filter type " + << filter_type << " in iteration #" << n; + return; + } + } + } + } + } + } +} + +using DirectionalIntraPredTest8bpp = DirectionalIntraPredTest<8, uint8_t>; + +const char* const* GetDirectionalIntraPredDigests8bpp(TransformSize tx_size) { + static const char* const kDigests4x4[kNumDirectionalIntraPredictors] = { + "9cfc1da729ad08682e165826c29b280b", + "bb73539c7afbda7bddd2184723b932d6", + "9d2882800ffe948196e984a26a2da72c", + }; + static const char* const kDigests4x8[kNumDirectionalIntraPredictors] = { + "090efe6f83cc6fa301f65d3bbd5c38d2", + "d0fba4cdfb90f8bd293a94cae9db1a15", + "f7ad0eeab4389d0baa485d30fec87617", + }; + static const char* const kDigests4x16[kNumDirectionalIntraPredictors] = { + "1d32b33c75fe85248c48cdc8caa78d84", + "7000e18159443d366129a6cc6ef8fcee", + "06c02fac5f8575f687abb3f634eb0b4c", + }; + static const char* const 
kDigests8x4[kNumDirectionalIntraPredictors] = { + "1b591799685bc135982114b731293f78", + "5cd9099acb9f7b2618dafa6712666580", + "d023883efede88f99c19d006044d9fa1", + }; + static const char* const kDigests8x8[kNumDirectionalIntraPredictors] = { + "f1e46ecf62a2516852f30c5025adb7ea", + "864442a209c16998065af28d8cdd839a", + "411a6e554868982af577de69e53f12e8", + }; + static const char* const kDigests8x16[kNumDirectionalIntraPredictors] = { + "89278302be913a85cfb06feaea339459", + "6c42f1a9493490cd4529fd40729cec3c", + "2516b5e1c681e5dcb1acedd5f3d41106", + }; + static const char* const kDigests8x32[kNumDirectionalIntraPredictors] = { + "aea7078f3eeaa8afbfe6c959c9e676f1", + "cad30babf12729dda5010362223ba65c", + "ff384ebdc832007775af418a2aae1463", + }; + static const char* const kDigests16x4[kNumDirectionalIntraPredictors] = { + "964a821c313c831e12f4d32e616c0b55", + "adf6dad3a84ab4d16c16eea218bec57a", + "a54fa008d43895e523474686c48a81c2", + }; + static const char* const kDigests16x8[kNumDirectionalIntraPredictors] = { + "fe2851b4e4f9fcf924cf17d50415a4c0", + "50a0e279c481437ff315d08eb904c733", + "0682065c8fb6cbf9be4949316c87c9e5", + }; + static const char* const kDigests16x16[kNumDirectionalIntraPredictors] = { + "ef15503b1943642e7a0bace1616c0e11", + "bf1a4d3f855f1072a902a88ec6ce0350", + "7e87a03e29cd7fd843fd71b729a18f3f", + }; + static const char* const kDigests16x32[kNumDirectionalIntraPredictors] = { + "f7b636615d2e5bf289b5db452a6f188d", + "e95858c532c10d00b0ce7a02a02121dd", + "34a18ccf58ef490f32268e85ce8c7de4", + }; + static const char* const kDigests16x64[kNumDirectionalIntraPredictors] = { + "b250099986c2fab9670748598058846b", + "f25d80af4da862a9b6b72979f1e17cb4", + "5347dc7bc346733b4887f6c8ad5e0898", + }; + static const char* const kDigests32x8[kNumDirectionalIntraPredictors] = { + "72e4c9f8af043b1cb1263490351818ab", + "1fc010d2df011b9e4e3d0957107c78df", + "f4cbfa3ca941ef08b972a68d7e7bafc4", + }; + static const char* const kDigests32x16[kNumDirectionalIntraPredictors] = 
{ + "37e5a1aaf7549d2bce08eece9d20f0f6", + "6a2794025d0aca414ab17baa3cf8251a", + "63dd37a6efdc91eeefef166c99ce2db1", + }; + static const char* const kDigests32x32[kNumDirectionalIntraPredictors] = { + "198aabc958992eb49cceab97d1acb43e", + "aee88b6c8bacfcf38799fe338e6c66e7", + "01e8f8f96696636f6d79d33951907a16", + }; + static const char* const kDigests32x64[kNumDirectionalIntraPredictors] = { + "0611390202c4f90f7add7aec763ded58", + "960240c7ceda2ccfac7c90b71460578a", + "7e7d97594aab8ad56e8c01c340335607", + }; + static const char* const kDigests64x16[kNumDirectionalIntraPredictors] = { + "7e1f567e7fc510757f2d89d638bc826f", + "c929d687352ce40a58670be2ce3c8c90", + "f6881e6a9ba3c3d3d730b425732656b1", + }; + static const char* const kDigests64x32[kNumDirectionalIntraPredictors] = { + "27b4c2a7081d4139f22003ba8b6dfdf2", + "301e82740866b9274108a04c872fa848", + "98d3aa4fef838f4abf00dac33806659f", + }; + static const char* const kDigests64x64[kNumDirectionalIntraPredictors] = { + "b31816db8fade3accfd975b21aa264c7", + "2adce01a03b9452633d5830e1a9b4e23", + "7b988fadba8b07c36e88d7be6b270494", + }; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4; + case kTransformSize4x8: + return kDigests4x8; + case kTransformSize4x16: + return kDigests4x16; + case kTransformSize8x4: + return kDigests8x4; + case kTransformSize8x8: + return kDigests8x8; + case kTransformSize8x16: + return kDigests8x16; + case kTransformSize8x32: + return kDigests8x32; + case kTransformSize16x4: + return kDigests16x4; + case kTransformSize16x8: + return kDigests16x8; + case kTransformSize16x16: + return kDigests16x16; + case kTransformSize16x32: + return kDigests16x32; + case kTransformSize16x64: + return kDigests16x64; + case kTransformSize32x8: + return kDigests32x8; + case kTransformSize32x16: + return kDigests32x16; + case kTransformSize32x32: + return kDigests32x32; + case kTransformSize32x64: + return kDigests32x64; + case kTransformSize64x16: + return kDigests64x16; + case 
kTransformSize64x32: + return kDigests64x32; + case kTransformSize64x64: + return kDigests64x64; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(DirectionalIntraPredTest8bpp, DISABLED_Speed) { + const auto num_runs = static_cast<int>(5e7 / (block_width_ * block_height_)); + for (int i = kZone1; i < kNumZones; ++i) { + TestSpeed(GetDirectionalIntraPredDigests8bpp(tx_size_), + static_cast<Zone>(i), num_runs); + } +} + +TEST_P(DirectionalIntraPredTest8bpp, FixedInput) { + for (int i = kZone1; i < kNumZones; ++i) { + TestSpeed(GetDirectionalIntraPredDigests8bpp(tx_size_), + static_cast<Zone>(i), 1); + } +} + +TEST_P(DirectionalIntraPredTest8bpp, Overflow) { TestSaturatedValues(); } +TEST_P(DirectionalIntraPredTest8bpp, Random) { TestRandomValues(); } + +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 + +using DirectionalIntraPredTest10bpp = DirectionalIntraPredTest<10, uint16_t>; + +const char* const* GetDirectionalIntraPredDigests10bpp(TransformSize tx_size) { + static const char* const kDigests4x4[kNumDirectionalIntraPredictors] = { + "a683f4d7ccd978737615f61ecb4d638d", + "90c94374eaf7e9501f197863937b8639", + "0d3969cd081523ac6a906eecc7980c43", + }; + static const char* const kDigests4x8[kNumDirectionalIntraPredictors] = { + "c3ffa2979b325644e4a56c882fe27347", + "1f61f5ee413a9a3b8d1d93869ec2aee0", + "4795ea944779ec4a783408769394d874", + }; + static const char* const kDigests4x16[kNumDirectionalIntraPredictors] = { + "45c3282c9aa51024c1d64a40f230aa45", + "5cd47dd69f8bd0b15365a0c5cfc0a49a", + "06336c507b05f98c1d6a21abc43e6182", + }; + static const char* const kDigests8x4[kNumDirectionalIntraPredictors] = { + "7370476ff0abbdc5e92f811b8879c861", + "a239a50adb28a4791b52a0dfff3bee06", + "4779a17f958a9ca04e8ec08c5aba1d36", + }; + static const char* const kDigests8x8[kNumDirectionalIntraPredictors] = { + "305463f346c376594f82aad8304e0362", + 
"0cd481e5bda286c87a645417569fd948", + "48c7899dc9b7163b0b1f61b3a2b4b73e", + }; + static const char* const kDigests8x16[kNumDirectionalIntraPredictors] = { + "5c18fd5339be90628c82b1fb6af50d5e", + "35eaa566ebd3bb7c903cfead5dc9ac78", + "9fdb0e790e5965810d02c02713c84071", + }; + static const char* const kDigests8x32[kNumDirectionalIntraPredictors] = { + "2168d6cc858c704748b7b343ced2ac3a", + "1d3ce273107447faafd2e55877e48ffb", + "d344164049d1fe9b65a3ae8764bbbd37", + }; + static const char* const kDigests16x4[kNumDirectionalIntraPredictors] = { + "dcef2cf51abe3fe150f388a14c762d30", + "6a810b289b1c14f8eab8ca1274e91ecd", + "c94da7c11f3fb11963d85c8804fce2d9", + }; + static const char* const kDigests16x8[kNumDirectionalIntraPredictors] = { + "50a0d08b0d99b7a574bad2cfb36efc39", + "2dcb55874db39da70c8ca1318559f9fe", + "6390bcd30ff3bc389ecc0a0952bea531", + }; + static const char* const kDigests16x16[kNumDirectionalIntraPredictors] = { + "7146c83c2620935606d49f3cb5876f41", + "2318ddf30c070a53c9b9cf199cd1b2c5", + "e9042e2124925aa7c1b6110617cb10e8", + }; + static const char* const kDigests16x32[kNumDirectionalIntraPredictors] = { + "c970f401de7b7c5bb4e3ad447fcbef8f", + "a18cc70730eecdaa31dbcf4306ff490f", + "32c1528ad4a576a2210399d6b4ccd46e", + }; + static const char* const kDigests16x64[kNumDirectionalIntraPredictors] = { + "00b3f0007da2e5d01380594a3d7162d5", + "1971af519e4a18967b7311f93efdd1b8", + "e6139769ce5a9c4982cfab9363004516", + }; + static const char* const kDigests32x8[kNumDirectionalIntraPredictors] = { + "08107ad971179cc9f465ae5966bd4901", + "b215212a3c0dfe9182c4f2e903d731f7", + "791274416a0da87c674e1ae318b3ce09", + }; + static const char* const kDigests32x16[kNumDirectionalIntraPredictors] = { + "94ea6cccae35b5d08799aa003ac08ccf", + "ae105e20e63fb55d4fd9d9e59dc62dde", + "973d0b2358ea585e4f486e7e645c5310", + }; + static const char* const kDigests32x32[kNumDirectionalIntraPredictors] = { + "d14c695c4853ddf5e5d8256bc1d1ed60", + "6bd0ebeb53adecc11442b1218b870cb7", + 
"e03bc402a9999aba8272275dce93e89f", + }; + static const char* const kDigests32x64[kNumDirectionalIntraPredictors] = { + "b21a8a8723758392ee659eeeae518a1e", + "e50285454896210ce44d6f04dfde05a7", + "f0f8ea0c6c2acc8d7d390927c3a90370", + }; + static const char* const kDigests64x16[kNumDirectionalIntraPredictors] = { + "ce51db16fd4fa56e601631397b098c89", + "aa87a8635e02c1e91d13158c61e443f6", + "4c1ee3afd46ef34bd711a34d0bf86f13", + }; + static const char* const kDigests64x32[kNumDirectionalIntraPredictors] = { + "25aaf5971e24e543e3e69a47254af777", + "eb6f444b3df127d69460778ab5bf8fc1", + "2f846cc0d506f90c0a58438600819817", + }; + static const char* const kDigests64x64[kNumDirectionalIntraPredictors] = { + "b26ce5b5f4b5d4a438b52e5987877fb8", + "35721a00a70938111939cf69988d928e", + "0af7ec35939483fac82c246a13845806", + }; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4; + case kTransformSize4x8: + return kDigests4x8; + case kTransformSize4x16: + return kDigests4x16; + case kTransformSize8x4: + return kDigests8x4; + case kTransformSize8x8: + return kDigests8x8; + case kTransformSize8x16: + return kDigests8x16; + case kTransformSize8x32: + return kDigests8x32; + case kTransformSize16x4: + return kDigests16x4; + case kTransformSize16x8: + return kDigests16x8; + case kTransformSize16x16: + return kDigests16x16; + case kTransformSize16x32: + return kDigests16x32; + case kTransformSize16x64: + return kDigests16x64; + case kTransformSize32x8: + return kDigests32x8; + case kTransformSize32x16: + return kDigests32x16; + case kTransformSize32x32: + return kDigests32x32; + case kTransformSize32x64: + return kDigests32x64; + case kTransformSize64x16: + return kDigests64x16; + case kTransformSize64x32: + return kDigests64x32; + case kTransformSize64x64: + return kDigests64x64; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(DirectionalIntraPredTest10bpp, DISABLED_Speed) { + const auto num_runs = 
static_cast<int>(5e7 / (block_width_ * block_height_)); + for (int i = kZone1; i < kNumZones; ++i) { + TestSpeed(GetDirectionalIntraPredDigests10bpp(tx_size_), + static_cast<Zone>(i), num_runs); + } +} + +TEST_P(DirectionalIntraPredTest10bpp, FixedInput) { + for (int i = kZone1; i < kNumZones; ++i) { + TestSpeed(GetDirectionalIntraPredDigests10bpp(tx_size_), + static_cast<Zone>(i), 1); + } +} + +TEST_P(DirectionalIntraPredTest10bpp, Overflow) { TestSaturatedValues(); } + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +constexpr TransformSize kTransformSizes[] = { + kTransformSize4x4, kTransformSize4x8, kTransformSize4x16, + kTransformSize8x4, kTransformSize8x8, kTransformSize8x16, + kTransformSize8x32, kTransformSize16x4, kTransformSize16x8, + kTransformSize16x16, kTransformSize16x32, kTransformSize16x64, + kTransformSize32x8, kTransformSize32x16, kTransformSize32x32, + kTransformSize32x64, kTransformSize64x16, kTransformSize64x32, + kTransformSize64x64}; + +INSTANTIATE_TEST_SUITE_P(C, DirectionalIntraPredTest8bpp, + testing::ValuesIn(kTransformSizes)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, DirectionalIntraPredTest8bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_SSE4_1 +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, DirectionalIntraPredTest8bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_NEON + +#if LIBGAV1_MAX_BITDEPTH >= 10 +INSTANTIATE_TEST_SUITE_P(C, DirectionalIntraPredTest10bpp, + testing::ValuesIn(kTransformSizes)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, DirectionalIntraPredTest10bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_SSE4_1 +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, DirectionalIntraPredTest10bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp + +static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) { + 
return os << ToString(tx_size); +} + +} // namespace libgav1 diff --git a/src/dsp/intrapred_filter.cc b/src/dsp/intrapred_filter.cc new file mode 100644 index 0000000..f4bd296 --- /dev/null +++ b/src/dsp/intrapred_filter.cc @@ -0,0 +1,144 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred_filter.h" + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstdlib> +#include <cstring> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/memory.h" + +namespace libgav1 { +namespace dsp { +namespace { + +//------------------------------------------------------------------------------ +// FilterIntraPredictor_C + +// The recursive filter applies a different filter to the top 4 and 2 left +// pixels to produce each pixel in a 4x2 sub-block. Each successive 4x2 uses the +// prediction output of the blocks above and to the left, unless they are +// adjacent to the |top_row| or |left_column|. The set of 8 filters is selected +// according to |pred|. 
+template <int bitdepth, typename Pixel> +void FilterIntraPredictor_C(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const left_column, + const FilterIntraPredictor pred, const int width, + const int height) { + const int kMaxPixel = (1 << bitdepth) - 1; + const auto* const top = static_cast<const Pixel*>(top_row); + const auto* const left = static_cast<const Pixel*>(left_column); + + assert(width <= 32 && height <= 32); + + Pixel buffer[3][33]; // cache 2 rows + top & left boundaries + memcpy(buffer[0], &top[-1], (width + 1) * sizeof(top[0])); + + auto* dst = static_cast<Pixel*>(dest); + stride /= sizeof(Pixel); + int row0 = 0, row2 = 2; + int ystep = 1; + int y = 0; + do { + buffer[1][0] = left[y]; + buffer[row2][0] = left[y + 1]; + int x = 1; + do { + const Pixel p0 = buffer[row0][x - 1]; // top-left + const Pixel p1 = buffer[row0][x + 0]; // top 0 + const Pixel p2 = buffer[row0][x + 1]; // top 1 + const Pixel p3 = buffer[row0][x + 2]; // top 2 + const Pixel p4 = buffer[row0][x + 3]; // top 3 + const Pixel p5 = buffer[1][x - 1]; // left 0 + const Pixel p6 = buffer[row2][x - 1]; // left 1 + for (int i = 0; i < 8; ++i) { + const int xoffset = i & 0x03; + const int yoffset = (i >> 2) * ystep; + const int value = kFilterIntraTaps[pred][i][0] * p0 + + kFilterIntraTaps[pred][i][1] * p1 + + kFilterIntraTaps[pred][i][2] * p2 + + kFilterIntraTaps[pred][i][3] * p3 + + kFilterIntraTaps[pred][i][4] * p4 + + kFilterIntraTaps[pred][i][5] * p5 + + kFilterIntraTaps[pred][i][6] * p6; + // Section 7.11.2.3 specifies the right-hand side of the assignment as + // Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ). + // Since Clip1() clips a negative value to 0, it is safe to replace + // Round2Signed() with Round2(). 
+ buffer[1 + yoffset][x + xoffset] = static_cast<Pixel>( + Clip3(RightShiftWithRounding(value, 4), 0, kMaxPixel)); + } + x += 4; + } while (x < width); + memcpy(dst, &buffer[1][1], width * sizeof(dst[0])); + dst += stride; + memcpy(dst, &buffer[row2][1], width * sizeof(dst[0])); + dst += stride; + + // The final row becomes the top for the next pass. + row0 ^= 2; + row2 ^= 2; + ystep = -ystep; + y += 2; + } while (y < height); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor + dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp10bpp_FilterIntraPredictor + dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace + +void IntraPredFilterInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/intrapred_filter.h b/src/dsp/intrapred_filter.h new file mode 100644 index 0000000..8146b82 --- /dev/null +++ b/src/dsp/intrapred_filter.h @@ -0,0 +1,49 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_ +#define LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/intrapred_filter_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/intrapred_filter_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*, +// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and +// Dsp::filter_intra_predictor. This function is not thread-safe. +void IntraPredFilterInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_ diff --git a/src/dsp/intrapred_filter_test.cc b/src/dsp/intrapred_filter_test.cc new file mode 100644 index 0000000..c420f0a --- /dev/null +++ b/src/dsp/intrapred_filter_test.cc @@ -0,0 +1,554 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred_filter.h" + +#include <cmath> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <memory> +#include <ostream> + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kMaxBlockSize = 64; +constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize; + +const char* const kFilterIntraPredNames[kNumFilterIntraPredictors] = { + "kFilterIntraPredictorDc", "kFilterIntraPredictorVertical", + "kFilterIntraPredictorHorizontal", "kFilterIntraPredictorD157", + "kFilterIntraPredictorPaeth", +}; + +template <int bitdepth, typename Pixel> +class IntraPredTestBase : public testing::TestWithParam<TransformSize>, + public test_utils::MaxAlignedAllocable { + public: + IntraPredTestBase() { + switch (tx_size_) { + case kNumTransformSizes: + EXPECT_NE(tx_size_, kNumTransformSizes); + break; + default: + block_width_ = kTransformWidth[tx_size_]; + block_height_ = kTransformHeight[tx_size_]; + break; + } + } + + IntraPredTestBase(const IntraPredTestBase&) = delete; + IntraPredTestBase& operator=(const IntraPredTestBase&) = delete; + ~IntraPredTestBase() override = default; + + protected: + struct 
IntraPredMem { + void Reset(libvpx_test::ACMRandom* rnd) { + ASSERT_NE(rnd, nullptr); + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + const int mask = (1 << bitdepth) - 1; + for (auto& r : ref_src) r = rnd->Rand16() & mask; + for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask; + for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask; + + // Some directional predictors require top-right, bottom-left. + for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) { + left[i] = rnd->Rand16() & mask; + top[i] = rnd->Rand16() & mask; + } + // TODO(jzern): reorder this and regenerate the digests after switching + // random number generators. + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + left[-1] = rnd->Rand16() & mask; + left[-2] = rnd->Rand16() & mask; + top[-2] = rnd->Rand16() & mask; + memset(left_mem, 0, sizeof(left_mem[0]) * 14); + memset(top_mem, 0, sizeof(top_mem[0]) * 14); + memset(top_mem + kMaxBlockSize * 2 + 16, 0, + sizeof(top_mem[0]) * kTopMemPadding); + } + + // Set ref_src, top-left, top and left to |pixel|. + void Set(const Pixel pixel) { + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + for (auto& r : ref_src) r = pixel; + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + for (int i = -2; i < 2 * kMaxBlockSize; ++i) { + left[i] = top[i] = pixel; + } + } + + // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|. 
+ static constexpr int kTopMemPadding = 7; + alignas(kMaxAlignment) Pixel dst[kTotalPixels]; + alignas(kMaxAlignment) Pixel ref_src[kTotalPixels]; + alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16]; + alignas( + kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding]; + }; + + void SetUp() override { test_utils::ResetDspTable(bitdepth); } + + const TransformSize tx_size_ = GetParam(); + int block_width_; + int block_height_; + IntraPredMem intra_pred_mem_; +}; + +//------------------------------------------------------------------------------ +// FilterIntraPredTest + +template <int bitdepth, typename Pixel> +class FilterIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> { + public: + FilterIntraPredTest() = default; + FilterIntraPredTest(const FilterIntraPredTest&) = delete; + FilterIntraPredTest& operator=(const FilterIntraPredTest&) = delete; + ~FilterIntraPredTest() override = default; + + protected: + using IntraPredTestBase<bitdepth, Pixel>::tx_size_; + using IntraPredTestBase<bitdepth, Pixel>::block_width_; + using IntraPredTestBase<bitdepth, Pixel>::block_height_; + using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_; + + void SetUp() override { + IntraPredTestBase<bitdepth, Pixel>::SetUp(); + IntraPredFilterInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_filter_intra_pred_ = dsp->filter_intra_predictor; + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + // No need to compare C with itself. 
+ base_filter_intra_pred_ = nullptr; + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + IntraPredFilterInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + IntraPredFilterInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + + // Put the current architecture-specific implementation up for testing and + // comparison against C version. + cur_filter_intra_pred_ = dsp->filter_intra_predictor; + } + + // These tests modify intra_pred_mem_. + void TestSpeed(const char* const digests[kNumFilterIntraPredictors], + int num_runs); + void TestSaturatedValues(); + void TestRandomValues(); + + FilterIntraPredictorFunc base_filter_intra_pred_; + FilterIntraPredictorFunc cur_filter_intra_pred_; +}; + +template <int bitdepth, typename Pixel> +void FilterIntraPredTest<bitdepth, Pixel>::TestSpeed( + const char* const digests[kNumFilterIntraPredictors], const int num_runs) { + ASSERT_NE(digests, nullptr); + const Pixel* const left = intra_pred_mem_.left_mem + 16; + const Pixel* const top = intra_pred_mem_.top_mem + 16; + + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + intra_pred_mem_.Reset(&rnd); + + // IntraPredInit_C() leaves the filter function empty. 
+ if (cur_filter_intra_pred_ == nullptr) return; + for (int i = 0; i < kNumFilterIntraPredictors; ++i) { + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const absl::Time start = absl::Now(); + for (int run = 0; run < num_runs; ++run) { + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + cur_filter_intra_pred_(intra_pred_mem_.dst, stride, top, left, + static_cast<FilterIntraPredictor>(i), block_width_, + block_height_); + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest(ToString(tx_size_), kFilterIntraPredNames[i], + digests[i], intra_pred_mem_.dst, + sizeof(intra_pred_mem_.dst), elapsed_time); + } +} + +template <int bitdepth, typename Pixel> +void FilterIntraPredTest<bitdepth, Pixel>::TestSaturatedValues() { + Pixel* const left = intra_pred_mem_.left_mem + 16; + Pixel* const top = intra_pred_mem_.top_mem + 16; + const auto kMaxPixel = static_cast<Pixel>((1 << bitdepth) - 1); + intra_pred_mem_.Set(kMaxPixel); + + // IntraPredInit_C() leaves the filter function empty. + if (cur_filter_intra_pred_ == nullptr) return; + for (int i = 0; i < kNumFilterIntraPredictors; ++i) { + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + cur_filter_intra_pred_(intra_pred_mem_.dst, stride, top, left, + static_cast<FilterIntraPredictor>(i), block_width_, + block_height_); + if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + block_width_, block_height_, kMaxBlockSize, + kMaxBlockSize, true)) { + ADD_FAILURE() << "Expected " << kFilterIntraPredNames[i] + << " to produce a block containing '" + << static_cast<int>(kMaxPixel) << "'"; + } + } +} + +template <int bitdepth, typename Pixel> +void FilterIntraPredTest<bitdepth, Pixel>::TestRandomValues() { + // Skip the 'C' test case as this is used as the reference. 
+ if (base_filter_intra_pred_ == nullptr) return; + + // Use an alternate seed to differentiate this test from TestSpeed(). + libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed); + for (int i = 0; i < kNumFilterIntraPredictors; ++i) { + // It may be worthwhile to temporarily increase this loop size when testing + // changes that specifically affect this test. + for (int n = 0; n < 10000; ++n) { + intra_pred_mem_.Reset(&rnd); + + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const Pixel* const top = intra_pred_mem_.top_mem + 16; + const Pixel* const left = intra_pred_mem_.left_mem + 16; + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + base_filter_intra_pred_(intra_pred_mem_.ref_src, stride, top, left, + static_cast<FilterIntraPredictor>(i), + block_width_, block_height_); + cur_filter_intra_pred_(intra_pred_mem_.dst, stride, top, left, + static_cast<FilterIntraPredictor>(i), block_width_, + block_height_); + if (!test_utils::CompareBlocks( + intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_, + block_height_, kMaxBlockSize, kMaxBlockSize, true)) { + ADD_FAILURE() << "Result from optimized version of " + << kFilterIntraPredNames[i] + << " differs from reference in iteration #" << n; + break; + } + } + } +} + +//------------------------------------------------------------------------------ +using FilterIntraPredTest8bpp = FilterIntraPredTest<8, uint8_t>; + +const char* const* GetFilterIntraPredDigests8bpp(TransformSize tx_size) { + static const char* const kDigests4x4[kNumFilterIntraPredictors] = { + "a2486efcfb351d60a8941203073e89c6", "240716ae5ecaedc19edae1bdef49e05d", + "dacf4af66a966aca7c75abe24cd9ba99", "311888773676f3c2ae3334c4e0f141e5", + "2d3711616c8d8798f608e313cb07a72a", + }; + static const char* const kDigests4x8[kNumFilterIntraPredictors] = { + "1cb74ba1abc68d936e87c13511ed5fbf", "d64c2c08586a762dbdfa8e1150bede06", + "73e9d1a9b6fa3e96fbd65c7dce507529", 
"e3ae17d9338e5aa3420d31d0e2d7ee87", + "750dbfe3bc5508b7031957a1d315b8bc", + }; + static const char* const kDigests4x16[kNumFilterIntraPredictors] = { + "48a1060701bf68ec6342d6e24c10ef17", "0c91ff7988814d192ed95e840a87b4bf", + "efe586b891c8828c4116c9fbf50850cc", "a3bfa10be2b155826f107e9256ac3ba1", + "976273745b94a561fd52f5aa96fb280f", + }; + static const char* const kDigests8x4[kNumFilterIntraPredictors] = { + "73f82633aeb28db1d254d077edefd8a9", "8eee505cdb5828e33b67ff5572445dac", + "9b0f101c28c66a916079fe5ed33b4021", "47fd44a7e5a5b55f067908192698e25c", + "eab59a3710d9bdeca8fa03a15d3f95d6", + }; + static const char* const kDigests8x8[kNumFilterIntraPredictors] = { + "aa07b7a007c4c1d494ddb44a23c27bcd", "d27eee43f15dfcfe4c46cd46b681983b", + "1015d26022cf57acfdb11fd3f6b9ccb0", "4f0e00ef556fbcac2fb31e3b18869070", + "918c2553635763a0756b20154096bca6", + }; + static const char* const kDigests8x16[kNumFilterIntraPredictors] = { + "a8ac58b2efb02092035cca206dbf5fbe", "0b22b000b7f124b32545bc86dd9f0142", + "cd6a08e023cad301c084b6ec2999da63", "c017f5f4fa5c05e7638ae4db98512b13", + "893e6995522e23ed3d613ef3797ca580", + }; + static const char* const kDigests8x32[kNumFilterIntraPredictors] = { + "b3d5d4f09b778ae2b8cc0e9014c22320", "e473874a1e65228707489be9ca6477aa", + "91bda5a2d32780af345bb3d49324732f", "20f2ff26f004f02e8e2be49e6cadc32f", + "00c909b749e36142b133a7357271e83e", + }; + static const char* const kDigests16x4[kNumFilterIntraPredictors] = { + "ef252f074fc3f5367748436e676e78ca", "cd436d8803ea40db3a849e7c869855c7", + "9cd8601b5d66e61fd002f8b11bfa58d9", "b982f17ee36ef0d1c2cfea20197d5666", + "9e350d1cd65d520194281633f566810d", + }; + static const char* const kDigests16x8[kNumFilterIntraPredictors] = { + "9a7e0cf9b023a89ee619ee672ba2a219", "c20186bc642912ecd4d48bc4924a79b1", + "77de044f4c7f717f947a36fc0aa17946", "3f2fc68f11e6ee0220adb8d1ee085c8e", + "2f37e586769dfb88d9d4116b9c28c5ab", + }; + static const char* const kDigests16x16[kNumFilterIntraPredictors] = { + 
"36c5b85b9a6b1d2e8f44f09c81adfe9c", "78494ce3a6a78aa2879ad2e24d43a005", + "aa30cd29a74407dbec80161745161eb2", "ae2a0975ef166e05e5e8c3701bd19e93", + "6322fba6f3bcb1f6c8e78160d200809c", + }; + static const char* const kDigests16x32[kNumFilterIntraPredictors] = { + "82d54732c37424946bc73f5a78f64641", "071773c82869bb103c31e05f14ed3c2f", + "3a0094c150bd6e21ce1f17243b21e76b", "998ffef26fc65333ae407bbe9d41a252", + "6491add6b665aafc364c8c104a6a233d", + }; + static const char* const kDigests32x8[kNumFilterIntraPredictors] = { + "c60062105dd727e94f744c35f0d2156e", "36a9e4d543701c4c546016e35e9c4337", + "05a8d07fe271023e63febfb44814d114", "0a28606925519d1ed067d64761619dc8", + "bb8c34b143910ba49b01d13e94d936ac", + }; + static const char* const kDigests32x16[kNumFilterIntraPredictors] = { + "60e6caeec9194fcb409469e6e1393128", "5d764ead046443eb14f76822a569b056", + "b1bf22fcc282614354166fa1eb6e5f8b", "4b188e729fe49ae24100b3ddd8f17313", + "75f430fdea0b7b5b66866fd68a795a6a", + }; + static const char* const kDigests32x32[kNumFilterIntraPredictors] = { + "5bb91a37b1979866eb23b59dd352229d", "589aa983109500749609d7be1cb79711", + "5e8fb1927cdbe21143494b56b5d400f6", "9e28f741d19c64b2a0577d83546d32d9", + "73c73237a5d891096066b186abf96854", + }; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4; + case kTransformSize4x8: + return kDigests4x8; + case kTransformSize4x16: + return kDigests4x16; + case kTransformSize8x4: + return kDigests8x4; + case kTransformSize8x8: + return kDigests8x8; + case kTransformSize8x16: + return kDigests8x16; + case kTransformSize8x32: + return kDigests8x32; + case kTransformSize16x4: + return kDigests16x4; + case kTransformSize16x8: + return kDigests16x8; + case kTransformSize16x16: + return kDigests16x16; + case kTransformSize16x32: + return kDigests16x32; + case kTransformSize32x8: + return kDigests32x8; + case kTransformSize32x16: + return kDigests32x16; + case kTransformSize32x32: + return kDigests32x32; + default: + ADD_FAILURE() << 
"Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(FilterIntraPredTest8bpp, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.5e8 / (block_width_ * block_height_)); + TestSpeed(GetFilterIntraPredDigests8bpp(tx_size_), num_runs); +} + +TEST_P(FilterIntraPredTest8bpp, FixedInput) { + TestSpeed(GetFilterIntraPredDigests8bpp(tx_size_), 1); +} + +TEST_P(FilterIntraPredTest8bpp, Overflow) { TestSaturatedValues(); } +TEST_P(FilterIntraPredTest8bpp, Random) { TestRandomValues(); } + +//------------------------------------------------------------------------------ + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using FilterIntraPredTest10bpp = FilterIntraPredTest<10, uint16_t>; + +const char* const* GetFilterIntraPredDigests10bpp(TransformSize tx_size) { + static const char* const kDigests4x4[kNumFilterIntraPredictors] = { + "13a9014d9e255cde8e3e85abf6ef5151", "aee33aa3f3baec87a8c019743fff40f1", + "fdd8ca2be424501f51fcdb603c2e757c", "aed00c082d1980d4bab45e9318b939f0", + "1b363db246aa5400f49479b7d5d41799", + }; + static const char* const kDigests4x8[kNumFilterIntraPredictors] = { + "e718b9e31ba3da0392fd4b6cfba5d882", "31ba22989cdc3bb80749685f42c6c697", + "6bc5b3a55b94018117569cfdced17bf9", "ec29979fb4936116493dfa1cfc93901c", + "c6bcf564e63c42148d9917f089566432", + }; + static const char* const kDigests4x16[kNumFilterIntraPredictors] = { + "404bddd88dff2c0414b5398287e54f18", "ff4fb3039cec6c9ffed6d259cbbfd854", + "7d6fa3ed9e728ff056a73c40bb6edeb6", "82845d942ad8048578e0037336905146", + "f3c07ea65db08c639136a5a9270f95ff", + }; + static const char* const kDigests8x4[kNumFilterIntraPredictors] = { + "2008981638f27ba9123973a733e46c3d", "47efecf1f7628cbd8c22e168fcceb5ce", + "04c857ffbd1edd6e2788b17410a4a39c", "deb0236c4277b4d7b174fba407e1c9d7", + "5b58567f94ae9fa930f700c68c17399d", + }; + static const char* const kDigests8x8[kNumFilterIntraPredictors] = { + "d9bab44a6d1373e758bfa0ee88239093", "29b10ddb32d9de2ff0cad6126f010ff6", + 
"1a03f9a18bdbab0811138cd969bf1f93", "e3273c24e77095ffa033a073f5bbcf7b", + "5187bb3df943d154cb01fb2f244ff86f", + }; + static const char* const kDigests8x16[kNumFilterIntraPredictors] = { + "a2199f792634a56f1c4e88510e408773", "8fd8a98969d19832975ee7131cca9dbb", + "d897380941f75b04b1327e63f136d7d6", "d36f52a157027d53b15b7c02a7983436", + "0a8c23047b0364f5687b62b01f043359", + }; + static const char* const kDigests8x32[kNumFilterIntraPredictors] = { + "5b74ea8e4f60151cf2db9b23d803a2e2", "e0d6bb5fa7d181589c31fcf2755d7c0b", + "42e590ffc88b8940b7aade22e13bbb6a", "e47c39ec1761aa7b5a9b1368ede7cfdc", + "6e963a89beac6f3a362c269d1017f9a8", + }; + static const char* const kDigests16x4[kNumFilterIntraPredictors] = { + "9eaa079622b5dd95ad3a8feb68fa9bbb", "17e3aa6a0034e9eedcfc65b8ce6e7205", + "eac5a5337dbaf9bcbc3d320745c8e190", "c6ba9a7e518be04f725bc1dbd399c204", + "19020b82ce8bb49a511820c7e1d58e99", + }; + static const char* const kDigests16x8[kNumFilterIntraPredictors] = { + "2d2c3255d5dfc1479a5d82a7d5a0d42e", "0fbb4ee851b4ee58c6d30dd820d19e38", + "fa77a1b056e8dc8efb702c7832531b32", "186269ca219dc663ad9b4a53e011a54b", + "c12180a6dcde0c3579befbb5304ff70b", + }; + static const char* const kDigests16x16[kNumFilterIntraPredictors] = { + "dbb81d7ee7d3c83c271400d0160b2e83", "4da656a3ef238d90bb8339471a6fdb7e", + "d95006bf299b84a1b04e38d5fa8fb4f7", "742a03331f0fbd66c57df0ae31104aca", + "4d20aa440e38b6b7ac83c8c54d313169", + }; + static const char* const kDigests16x32[kNumFilterIntraPredictors] = { + "6247730c93789cc25bcb837781dfa05b", "9a93e14b06dd145e35ab21a0353bdebe", + "6c5866353e30296a67d9bd7a65d6998d", "389d7f038d7997871745bb1305156ff9", + "e7640d81f891e1d06e7da75c6ae74d93", + }; + static const char* const kDigests32x8[kNumFilterIntraPredictors] = { + "68f3a603b7c25dd78deffe91aef22834", "48c735e4aa951d6333d99e571bfeadc8", + "35239df0993a429fc599a3037c731e4b", "ba7dd72e04af1a1fc1b30784c11df783", + "78e9017f7434665d32ec59795aed0012", + }; + static const char* const 
kDigests32x16[kNumFilterIntraPredictors] = { + "8cf2f11f7f77901cb0c522ad191eb998", "204c76d68c5117b89b5c3a05d5548883", + "f3751e41e7a595f43d8aaf9a40644e05", "81ea1a7d608d7b91dd3ede0f87e750ee", + "b5951334dfbe6229d828e03cd2d98538", + }; + static const char* const kDigests32x32[kNumFilterIntraPredictors] = { + "9d8630188c3d1a4f28a6106e343c9380", "c6c92e059faa17163522409b7bf93230", + "62e4c959cb06ec661d98769981fbd555", "01e61673f11011571246668e36cc61c5", + "4530222ea1de546e202630fcf43f4526", + }; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4; + case kTransformSize4x8: + return kDigests4x8; + case kTransformSize4x16: + return kDigests4x16; + case kTransformSize8x4: + return kDigests8x4; + case kTransformSize8x8: + return kDigests8x8; + case kTransformSize8x16: + return kDigests8x16; + case kTransformSize8x32: + return kDigests8x32; + case kTransformSize16x4: + return kDigests16x4; + case kTransformSize16x8: + return kDigests16x8; + case kTransformSize16x16: + return kDigests16x16; + case kTransformSize16x32: + return kDigests16x32; + case kTransformSize32x8: + return kDigests32x8; + case kTransformSize32x16: + return kDigests32x16; + case kTransformSize32x32: + return kDigests32x32; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(FilterIntraPredTest10bpp, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.5e8 / (block_width_ * block_height_)); + TestSpeed(GetFilterIntraPredDigests10bpp(tx_size_), num_runs); +} + +TEST_P(FilterIntraPredTest10bpp, FixedInput) { + TestSpeed(GetFilterIntraPredDigests10bpp(tx_size_), 1); +} + +TEST_P(FilterIntraPredTest10bpp, Overflow) { TestSaturatedValues(); } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +// Filter-intra and Cfl predictors are available only for transform sizes +// with max(width, height) <= 32. 
+constexpr TransformSize kTransformSizesSmallerThan32x32[] = { + kTransformSize4x4, kTransformSize4x8, kTransformSize4x16, + kTransformSize8x4, kTransformSize8x8, kTransformSize8x16, + kTransformSize8x32, kTransformSize16x4, kTransformSize16x8, + kTransformSize16x16, kTransformSize16x32, kTransformSize32x8, + kTransformSize32x16, kTransformSize32x32}; + +INSTANTIATE_TEST_SUITE_P(C, FilterIntraPredTest8bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, FilterIntraPredTest8bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#endif // LIBGAV1_ENABLE_SSE4_1 +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, FilterIntraPredTest8bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#endif // LIBGAV1_ENABLE_NEON + +#if LIBGAV1_MAX_BITDEPTH >= 10 +INSTANTIATE_TEST_SUITE_P(C, FilterIntraPredTest10bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp + +static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) { + return os << ToString(tx_size); +} + +} // namespace libgav1 diff --git a/src/dsp/intrapred_smooth.cc b/src/dsp/intrapred_smooth.cc new file mode 100644 index 0000000..83c005e --- /dev/null +++ b/src/dsp/intrapred_smooth.cc @@ -0,0 +1,738 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/intrapred_smooth.h" + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdlib> +#include <cstring> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace { + +template <int block_width, int block_height, typename Pixel> +struct SmoothFuncs_C { + SmoothFuncs_C() = delete; + + static void Smooth(void* dest, ptrdiff_t stride, const void* top_row, + const void* left_column); + static void SmoothVertical(void* dest, ptrdiff_t stride, const void* top_row, + const void* left_column); + static void SmoothHorizontal(void* dest, ptrdiff_t stride, + const void* top_row, const void* left_column); +}; + +constexpr uint8_t kSmoothWeights[] = { + // block dimension = 4 + 255, 149, 85, 64, + // block dimension = 8 + 255, 197, 146, 105, 73, 50, 37, 32, + // block dimension = 16 + 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, + // block dimension = 32 + 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, + 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, + // block dimension = 64 + 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, + 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, + 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, + 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4}; + +// SmoothFuncs_C::Smooth +template <int block_width, int block_height, typename Pixel> +void SmoothFuncs_C<block_width, block_height, Pixel>::Smooth( + void* const dest, ptrdiff_t stride, const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast<const Pixel*>(top_row); + const auto* const left = static_cast<const Pixel*>(left_column); + const Pixel top_right = top[block_width - 1]; + const Pixel bottom_left = left[block_height - 1]; + static_assert( + block_width 
>= 4 && block_height >= 4, + "Weights for smooth predictor undefined for block width/height < 4"); + const uint8_t* const weights_x = kSmoothWeights + block_width - 4; + const uint8_t* const weights_y = kSmoothWeights + block_height - 4; + const uint16_t scale_value = (1 << kSmoothWeightScale); + auto* dst = static_cast<Pixel*>(dest); + stride /= sizeof(Pixel); + + for (int y = 0; y < block_height; ++y) { + for (int x = 0; x < block_width; ++x) { + assert(scale_value >= weights_y[y] && scale_value >= weights_x[x]); + uint32_t pred = weights_y[y] * top[x]; + pred += weights_x[x] * left[y]; + pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left; + pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right; + // The maximum value of pred with the rounder is 2^9 * (2^bitdepth - 1) + // + 256. With the descale there's no need for saturation. + dst[x] = static_cast<Pixel>( + RightShiftWithRounding(pred, kSmoothWeightScale + 1)); + } + dst += stride; + } +} + +// SmoothFuncs_C::SmoothVertical +template <int block_width, int block_height, typename Pixel> +void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothVertical( + void* const dest, ptrdiff_t stride, const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast<const Pixel*>(top_row); + const auto* const left = static_cast<const Pixel*>(left_column); + const Pixel bottom_left = left[block_height - 1]; + static_assert(block_height >= 4, + "Weights for smooth predictor undefined for block height < 4"); + const uint8_t* const weights_y = kSmoothWeights + block_height - 4; + const uint16_t scale_value = (1 << kSmoothWeightScale); + auto* dst = static_cast<Pixel*>(dest); + stride /= sizeof(Pixel); + + for (int y = 0; y < block_height; ++y) { + for (int x = 0; x < block_width; ++x) { + assert(scale_value >= weights_y[y]); + uint32_t pred = weights_y[y] * top[x]; + pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left; + dst[x] = + 
static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale)); + } + dst += stride; + } +} + +// SmoothFuncs_C::SmoothHorizontal +template <int block_width, int block_height, typename Pixel> +void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothHorizontal( + void* const dest, ptrdiff_t stride, const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast<const Pixel*>(top_row); + const auto* const left = static_cast<const Pixel*>(left_column); + const Pixel top_right = top[block_width - 1]; + static_assert(block_width >= 4, + "Weights for smooth predictor undefined for block width < 4"); + const uint8_t* const weights_x = kSmoothWeights + block_width - 4; + const uint16_t scale_value = (1 << kSmoothWeightScale); + auto* dst = static_cast<Pixel*>(dest); + stride /= sizeof(Pixel); + + for (int y = 0; y < block_height; ++y) { + for (int x = 0; x < block_width; ++x) { + assert(scale_value >= weights_x[x]); + uint32_t pred = weights_x[x] * left[y]; + pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right; + dst[x] = + static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale)); + } + dst += stride; + } +} + +// ----------------------------------------------------------------------------- + +template <typename Pixel> +struct SmoothDefs { + SmoothDefs() = delete; + + using _4x4 = SmoothFuncs_C<4, 4, Pixel>; + using _4x8 = SmoothFuncs_C<4, 8, Pixel>; + using _4x16 = SmoothFuncs_C<4, 16, Pixel>; + using _8x4 = SmoothFuncs_C<8, 4, Pixel>; + using _8x8 = SmoothFuncs_C<8, 8, Pixel>; + using _8x16 = SmoothFuncs_C<8, 16, Pixel>; + using _8x32 = SmoothFuncs_C<8, 32, Pixel>; + using _16x4 = SmoothFuncs_C<16, 4, Pixel>; + using _16x8 = SmoothFuncs_C<16, 8, Pixel>; + using _16x16 = SmoothFuncs_C<16, 16, Pixel>; + using _16x32 = SmoothFuncs_C<16, 32, Pixel>; + using _16x64 = SmoothFuncs_C<16, 64, Pixel>; + using _32x8 = SmoothFuncs_C<32, 8, Pixel>; + using _32x16 = SmoothFuncs_C<32, 16, Pixel>; + using _32x32 
= SmoothFuncs_C<32, 32, Pixel>; + using _32x64 = SmoothFuncs_C<32, 64, Pixel>; + using _64x16 = SmoothFuncs_C<64, 16, Pixel>; + using _64x32 = SmoothFuncs_C<64, 32, Pixel>; + using _64x64 = SmoothFuncs_C<64, 64, Pixel>; +}; + +using Defs = SmoothDefs<uint8_t>; + +// Initializes dsp entries for kTransformSize|W|x|H| from |DEFS| of +// the same size. +#define INIT_SMOOTH_WxH(DEFS, W, H) \ + dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorSmooth] = \ + DEFS::_##W##x##H::Smooth; \ + dsp->intra_predictors[kTransformSize##W##x##H] \ + [kIntraPredictorSmoothVertical] = \ + DEFS::_##W##x##H::SmoothVertical; \ + dsp->intra_predictors[kTransformSize##W##x##H] \ + [kIntraPredictorSmoothHorizontal] = \ + DEFS::_##W##x##H::SmoothHorizontal + +#define INIT_SMOOTH(DEFS) \ + INIT_SMOOTH_WxH(DEFS, 4, 4); \ + INIT_SMOOTH_WxH(DEFS, 4, 8); \ + INIT_SMOOTH_WxH(DEFS, 4, 16); \ + INIT_SMOOTH_WxH(DEFS, 8, 4); \ + INIT_SMOOTH_WxH(DEFS, 8, 8); \ + INIT_SMOOTH_WxH(DEFS, 8, 16); \ + INIT_SMOOTH_WxH(DEFS, 8, 32); \ + INIT_SMOOTH_WxH(DEFS, 16, 4); \ + INIT_SMOOTH_WxH(DEFS, 16, 8); \ + INIT_SMOOTH_WxH(DEFS, 16, 16); \ + INIT_SMOOTH_WxH(DEFS, 16, 32); \ + INIT_SMOOTH_WxH(DEFS, 16, 64); \ + INIT_SMOOTH_WxH(DEFS, 32, 8); \ + INIT_SMOOTH_WxH(DEFS, 32, 16); \ + INIT_SMOOTH_WxH(DEFS, 32, 32); \ + INIT_SMOOTH_WxH(DEFS, 32, 64); \ + INIT_SMOOTH_WxH(DEFS, 64, 16); \ + INIT_SMOOTH_WxH(DEFS, 64, 32); \ + INIT_SMOOTH_WxH(DEFS, 64, 64) + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + INIT_SMOOTH(Defs); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] = + Defs::_4x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] = + 
Defs::_4x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] = + Defs::_4x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] = + Defs::_4x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] = + Defs::_4x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = + Defs::_4x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] = + Defs::_4x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] = + Defs::_4x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = + Defs::_4x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] = + Defs::_8x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] = + Defs::_8x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = + Defs::_8x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] = + 
Defs::_8x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] = + Defs::_8x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = + Defs::_8x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] = + Defs::_8x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] = + Defs::_8x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = + Defs::_8x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] = + Defs::_8x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] = + Defs::_8x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = + Defs::_8x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] = + Defs::_16x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] = + Defs::_16x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal + 
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] = + Defs::_16x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] = + Defs::_16x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] = + Defs::_16x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] = + Defs::_16x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] = + Defs::_16x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] = + Defs::_16x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] = + Defs::_16x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] = + Defs::_16x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] = + Defs::_16x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] = + Defs::_16x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] = + Defs::_16x64::Smooth; +#endif +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] = + Defs::_16x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] = + Defs::_16x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] = + Defs::_32x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] = + Defs::_32x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] = + Defs::_32x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] = + Defs::_32x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] = + Defs::_32x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] = + Defs::_32x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] = + Defs::_32x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] = + Defs::_32x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] = 
+ Defs::_32x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] = + Defs::_32x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] = + Defs::_32x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] = + Defs::_32x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] = + Defs::_64x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] = + Defs::_64x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] = + Defs::_64x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] = + Defs::_64x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] = + Defs::_64x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] = + Defs::_64x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] = + Defs::_64x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical + 
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] = + Defs::_64x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = + Defs::_64x64::SmoothHorizontal; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} // NOLINT(readability/fn_size) + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using DefsHbd = SmoothDefs<uint16_t>; + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + INIT_SMOOTH(DefsHbd); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] = + DefsHbd::_4x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] = + DefsHbd::_4x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] = + DefsHbd::_4x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] = + DefsHbd::_4x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] = + DefsHbd::_4x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = + DefsHbd::_4x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] = + DefsHbd::_4x16::Smooth; +#endif +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] = + DefsHbd::_4x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_4x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] = + DefsHbd::_8x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] = + DefsHbd::_8x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = + DefsHbd::_8x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] = + DefsHbd::_8x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] = + DefsHbd::_8x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = + DefsHbd::_8x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] = + DefsHbd::_8x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] = + DefsHbd::_8x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = 
+ DefsHbd::_8x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] = + DefsHbd::_8x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] = + DefsHbd::_8x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_8x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] = + DefsHbd::_16x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] = + DefsHbd::_16x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] = + DefsHbd::_16x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] = + DefsHbd::_16x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] = + DefsHbd::_16x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical + 
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] = + DefsHbd::_16x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] = + DefsHbd::_16x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] = + DefsHbd::_16x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] = + DefsHbd::_16x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] = + DefsHbd::_16x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] = + DefsHbd::_32x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] = + DefsHbd::_32x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] = + 
DefsHbd::_32x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] = + DefsHbd::_32x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] = + DefsHbd::_32x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_32x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] = + DefsHbd::_32x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] = + DefsHbd::_32x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_32x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] = + DefsHbd::_32x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] = + DefsHbd::_32x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] = + DefsHbd::_32x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] = + DefsHbd::_64x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical + 
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] = + DefsHbd::_64x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_64x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] = + DefsHbd::_64x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] = + DefsHbd::_64x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_64x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] = + DefsHbd::_64x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] = + DefsHbd::_64x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = + DefsHbd::_64x64::SmoothHorizontal; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} // NOLINT(readability/fn_size) +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#undef INIT_SMOOTH_WxH +#undef INIT_SMOOTH +} // namespace + +void IntraPredSmoothInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/intrapred_smooth.h b/src/dsp/intrapred_smooth.h new file mode 100644 index 0000000..6802003 --- /dev/null +++ b/src/dsp/intrapred_smooth.h @@ -0,0 +1,48 @@ +/* + * Copyright 2021 The libgav1 
Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_ +#define LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/intrapred_smooth_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/intrapred_smooth_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*]. +// This function is not thread-safe. +void IntraPredSmoothInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_ diff --git a/src/dsp/intrapred_test.cc b/src/dsp/intrapred_test.cc new file mode 100644 index 0000000..335aa2f --- /dev/null +++ b/src/dsp/intrapred_test.cc @@ -0,0 +1,710 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred.h" + +#include <cmath> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <memory> +#include <ostream> + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/intrapred_smooth.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kMaxBlockSize = 64; +constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize; + +template <int bitdepth, typename Pixel> +class IntraPredTestBase : public testing::TestWithParam<TransformSize>, + public test_utils::MaxAlignedAllocable { + public: + IntraPredTestBase() { + switch (tx_size_) { + case kNumTransformSizes: + EXPECT_NE(tx_size_, kNumTransformSizes); + break; + default: + block_width_ = kTransformWidth[tx_size_]; + block_height_ = kTransformHeight[tx_size_]; + break; + } + } + + IntraPredTestBase(const IntraPredTestBase&) = delete; + IntraPredTestBase& operator=(const IntraPredTestBase&) = delete; + ~IntraPredTestBase() override = default; + + protected: + struct IntraPredMem { + void Reset(libvpx_test::ACMRandom* rnd) { + ASSERT_NE(rnd, nullptr); + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + const int mask = (1 << bitdepth) - 1; + for (auto& r : 
ref_src) r = rnd->Rand16() & mask; + for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask; + for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask; + + // Some directional predictors require top-right, bottom-left. + for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) { + left[i] = rnd->Rand16() & mask; + top[i] = rnd->Rand16() & mask; + } + // TODO(jzern): reorder this and regenerate the digests after switching + // random number generators. + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + left[-1] = rnd->Rand16() & mask; + left[-2] = rnd->Rand16() & mask; + top[-2] = rnd->Rand16() & mask; + memset(left_mem, 0, sizeof(left_mem[0]) * 14); + memset(top_mem, 0, sizeof(top_mem[0]) * 14); + memset(top_mem + kMaxBlockSize * 2 + 16, 0, + sizeof(top_mem[0]) * kTopMemPadding); + } + + // Set ref_src, top-left, top and left to |pixel|. + void Set(const Pixel pixel) { + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + for (auto& r : ref_src) r = pixel; + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + for (int i = -2; i < 2 * kMaxBlockSize; ++i) { + left[i] = top[i] = pixel; + } + } + + // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|. 
+ static constexpr int kTopMemPadding = 7; + alignas(kMaxAlignment) Pixel dst[kTotalPixels]; + alignas(kMaxAlignment) Pixel ref_src[kTotalPixels]; + alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16]; + alignas( + kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding]; + }; + + void SetUp() override { test_utils::ResetDspTable(bitdepth); } + + const TransformSize tx_size_ = GetParam(); + int block_width_; + int block_height_; + IntraPredMem intra_pred_mem_; +}; + +//------------------------------------------------------------------------------ +// IntraPredTest + +template <int bitdepth, typename Pixel> +class IntraPredTest : public IntraPredTestBase<bitdepth, Pixel> { + public: + IntraPredTest() = default; + IntraPredTest(const IntraPredTest&) = delete; + IntraPredTest& operator=(const IntraPredTest&) = delete; + ~IntraPredTest() override = default; + + protected: + using IntraPredTestBase<bitdepth, Pixel>::tx_size_; + using IntraPredTestBase<bitdepth, Pixel>::block_width_; + using IntraPredTestBase<bitdepth, Pixel>::block_height_; + using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_; + + void SetUp() override { + IntraPredTestBase<bitdepth, Pixel>::SetUp(); + IntraPredInit_C(); + IntraPredSmoothInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + memcpy(base_intrapreds_, dsp->intra_predictors[tx_size_], + sizeof(base_intrapreds_)); + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + memset(base_intrapreds_, 0, sizeof(base_intrapreds_)); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + IntraPredInit_SSE4_1(); + IntraPredSmoothInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + IntraPredInit_NEON(); + IntraPredSmoothInit_NEON(); + } else { + FAIL() << "Unrecognized 
architecture prefix in test case name: " + << test_case; + } + + memcpy(cur_intrapreds_, dsp->intra_predictors[tx_size_], + sizeof(cur_intrapreds_)); + + for (int i = 0; i < kNumIntraPredictors; ++i) { + // skip functions that haven't been specialized for this particular + // architecture. + if (cur_intrapreds_[i] == base_intrapreds_[i]) { + cur_intrapreds_[i] = nullptr; + } + } + } + + // These tests modify intra_pred_mem_. + void TestSpeed(const char* const digests[kNumIntraPredictors], int num_runs); + void TestSaturatedValues(); + void TestRandomValues(); + + IntraPredictorFunc base_intrapreds_[kNumIntraPredictors]; + IntraPredictorFunc cur_intrapreds_[kNumIntraPredictors]; +}; + +template <int bitdepth, typename Pixel> +void IntraPredTest<bitdepth, Pixel>::TestSpeed( + const char* const digests[kNumIntraPredictors], const int num_runs) { + ASSERT_NE(digests, nullptr); + const auto* const left = + reinterpret_cast<const uint8_t*>(intra_pred_mem_.left_mem + 16); + const auto* const top = + reinterpret_cast<const uint8_t*>(intra_pred_mem_.top_mem + 16); + + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + intra_pred_mem_.Reset(&rnd); + + for (int i = 0; i < kNumIntraPredictors; ++i) { + if (cur_intrapreds_[i] == nullptr) continue; + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const absl::Time start = absl::Now(); + for (int run = 0; run < num_runs; ++run) { + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + cur_intrapreds_[i](intra_pred_mem_.dst, stride, top, left); + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest(ToString(tx_size_), + ToString(static_cast<IntraPredictor>(i)), + digests[i], intra_pred_mem_.dst, + sizeof(intra_pred_mem_.dst), elapsed_time); + } +} + +template <int bitdepth, typename Pixel> +void IntraPredTest<bitdepth, Pixel>::TestSaturatedValues() { + Pixel* const left = intra_pred_mem_.left_mem + 16; + Pixel* const top = 
intra_pred_mem_.top_mem + 16; + const auto kMaxPixel = static_cast<Pixel>((1 << bitdepth) - 1); + intra_pred_mem_.Set(kMaxPixel); + + // skip DcFill + for (int i = 1; i < kNumIntraPredictors; ++i) { + if (cur_intrapreds_[i] == nullptr) continue; + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + cur_intrapreds_[i](intra_pred_mem_.dst, stride, top, left); + if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + block_width_, block_height_, kMaxBlockSize, + kMaxBlockSize, true)) { + ADD_FAILURE() << "Expected " << ToString(static_cast<IntraPredictor>(i)) + << " to produce a block containing '" + << static_cast<int>(kMaxPixel) << "'"; + } + } +} + +template <int bitdepth, typename Pixel> +void IntraPredTest<bitdepth, Pixel>::TestRandomValues() { + // Use an alternate seed to differentiate this test from TestSpeed(). + libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed); + for (int i = 0; i < kNumIntraPredictors; ++i) { + // Skip the 'C' test case as this is used as the reference. + if (base_intrapreds_[i] == nullptr) continue; + if (cur_intrapreds_[i] == nullptr) continue; + // It may be worthwhile to temporarily increase this loop size when testing + // changes that specifically affect this test. 
+ for (int n = 0; n < 10000; ++n) { + intra_pred_mem_.Reset(&rnd); + + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const Pixel* const top = intra_pred_mem_.top_mem + 16; + const Pixel* const left = intra_pred_mem_.left_mem + 16; + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + base_intrapreds_[i](intra_pred_mem_.ref_src, stride, top, left); + cur_intrapreds_[i](intra_pred_mem_.dst, stride, top, left); + if (!test_utils::CompareBlocks( + intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_, + block_height_, kMaxBlockSize, kMaxBlockSize, true)) { + ADD_FAILURE() << "Result from optimized version of " + << ToString(static_cast<IntraPredictor>(i)) + << " differs from reference in iteration #" << n; + break; + } + } + } +} + +//------------------------------------------------------------------------------ + +using IntraPredTest8bpp = IntraPredTest<8, uint8_t>; + +const char* const* GetIntraPredDigests8bpp(TransformSize tx_size) { + static const char* const kDigests4x4[kNumIntraPredictors] = { + "7b1c762e28747f885d2b7d83cb8aa75c", "73353f179207f1432d40a132809e3a50", + "80c9237c838b0ec0674ccb070df633d5", "1cd79116b41fda884e7fa047f5eb14df", + "33211425772ee539a59981a2e9dc10c1", "d6f5f65a267f0e9a2752e8151cc1dcd7", + "7ff8c762cb766eb0665682152102ce4b", "2276b861ae4599de15938651961907ec", + "766982bc69f4aaaa8e71014c2dc219bc", "e2c31b5fd2199c49e17c31610339ab3f", + }; + static const char* const kDigests4x8[kNumIntraPredictors] = { + "0a0d8641ecfa0e82f541acdc894d5574", "1a40371af6cff9c278c5b0def9e4b3e7", + "3631a7a99569663b514f15b590523822", "646c7b592136285bd31501494e7393e7", + "ecbe89cc64dc2688123d3cfe865b5237", "79048e70ecbb7d43a4703f62718588c0", + "f3de11bf1198a00675d806d29c41d676", "32bb6cd018f6e871c342fcc21c7180cf", + "6f076a1e5ab3d69cf08811d62293e4be", "2a84460a8b189b4589824cf6b3b39954", + }; + static const char* const kDigests4x16[kNumIntraPredictors] = { + "cb8240be98444ede5ae98ca94afc1557", 
"460acbcf825a1fa0d8f2aa6bf2d6a21c", + "7896fdbbfe538dce1dc3a5b0873d74b0", "504aea29c6b27f21555d5516b8de2d8a", + "c5738e7fa82b91ea0e39232120da56ea", "19abbd934c243a6d9df7585d81332dd5", + "9e42b7b342e45c842dfa8aedaddbdfaa", "0e9eb07a89f8bf96bc219d5d1c3d9f6d", + "659393c31633e0f498bae384c9df5c7b", "bee3a28312da99dd550ec309ae4fff25", + }; + static const char* const kDigests8x4[kNumIntraPredictors] = { + "5950744064518f77867c8e14ebd8b5d7", "46b6cbdc76efd03f4ac77870d54739f7", + "efe21fd1b98cb1663950e0bf49483b3b", "3c647b64760b298092cbb8e2f5c06bfd", + "c3595929687ffb04c59b128d56e2632f", "d89ad2ddf8a74a520fdd1d7019fd75b4", + "53907cb70ad597ee5885f6c58201f98b", "09d2282a29008b7fb47eb60ed6653d06", + "e341fc1c910d7cb2dac5dbc58b9c9af9", "a8fabd4c259b607a90a2e4d18cae49de", + }; + static const char* const kDigests8x8[kNumIntraPredictors] = { + "06fb7cb52719855a38b4883b4b241749", "2013aafd42a4303efb553e42264ab8b0", + "2f070511d5680c12ca73a20e47fd6e23", "9923705af63e454392625794d5459fe0", + "04007a0d39778621266e2208a22c4fac", "2d296c202d36b4a53f1eaddda274e4a1", + "c87806c220d125c7563c2928e836fbbd", "339b49710a0099087e51ab5afc8d8713", + "c90fbc020afd9327bf35dccae099bf77", "95b356a7c346334d29294a5e2d13cfd9", + }; + static const char* const kDigests8x16[kNumIntraPredictors] = { + "3c5a4574d96b5bb1013429636554e761", "8cf56b17c52d25eb785685f2ab48b194", + "7911e2e02abfbe226f17529ac5db08fc", "064e509948982f66a14293f406d88d42", + "5c443aa713891406d5be3af4b3cf67c6", "5d2cb98e532822ca701110cda9ada968", + "3d58836e17918b8890012dd96b95bb9d", "20e8d61ddc451b9e553a294073349ffd", + "a9aa6cf9d0dcf1977a1853ccc264e40b", "103859f85750153f47b81f68ab7881f2", + }; + static const char* const kDigests8x32[kNumIntraPredictors] = { + "b393a2db7a76acaccc39e04d9dc3e8ac", "bbda713ee075a7ef095f0f479b5a1f82", + "f337dce3980f70730d6f6c2c756e3b62", "796189b05dc026e865c9e95491b255d1", + "ea932c21e7189eeb215c1990491320ab", "a9fffdf9455eba5e3b01317cae140289", + "9525dbfdbf5fba61ef9c7aa5fe887503", 
"8c6a7e3717ff8a459f415c79bb17341c", + "3761071bfaa2363a315fe07223f95a2d", "0e5aeb9b3f485b90df750469f60c15aa", + }; + static const char* const kDigests16x4[kNumIntraPredictors] = { + "1c0a950b3ac500def73b165b6a38467c", "95e7f7300f19da280c6a506e40304462", + "28a6af15e31f76d3ff189012475d78f5", "e330d67b859bceef62b96fc9e1f49a34", + "36eca3b8083ce2fb5f7e6227dfc34e71", "08f567d2abaa8e83e4d9b33b3f709538", + "dc2d0ba13aa9369446932f03b53dc77d", "9ab342944c4b1357aa79d39d7bebdd3a", + "77ec278c5086c88b91d68eef561ed517", "60fbe11bfe216c182aaacdec326c4dae", + }; + static const char* const kDigests16x8[kNumIntraPredictors] = { + "053a2bc4b5b7287fee524af4e77f077a", "619b720b13f14f32391a99ea7ff550d5", + "728d61c11b06baf7fe77881003a918b9", "889997b89a44c9976cb34f573e2b1eea", + "b43bfc31d1c770bb9ca5ca158c9beec4", "9d3fe9f762e0c6e4f114042147c50c7f", + "c74fdd7c9938603b01e7ecf9fdf08d61", "870c7336db1102f80f74526bd5a7cf4e", + "3fd5354a6190903d6a0b661fe177daf6", "409ca6b0b2558aeadf5ef2b8a887e67a", + }; + static const char* const kDigests16x16[kNumIntraPredictors] = { + "1fa9e2086f6594bda60c30384fbf1635", "2098d2a030cd7c6be613edc74dc2faf8", + "f3c72b0c8e73f1ddca04d14f52d194d8", "6b31f2ee24cf88d3844a2fc67e1f39f3", + "d91a22a83575e9359c5e4871ab30ddca", "24c32a0d38b4413d2ef9bf1f842c8634", + "6e9e47bf9da9b2b9ae293e0bbd8ff086", "968b82804b5200b074bcdba9718140d4", + "4e6d7e612c5ae0bbdcc51a453cd1db3f", "ce763a41977647d072f33e277d69c7b9", + }; + static const char* const kDigests16x32[kNumIntraPredictors] = { + "01afd04432026ff56327d6226b720be2", "a6e7be906cc6f1e7a520151bfa7c303d", + "bc05c46f18d0638f0228f1de64f07cd5", "204e613e429935f721a5b29cec7d44bb", + "aa0a7c9a7482dfc06d9685072fc5bafd", "ffb60f090d83c624bb4f7dc3a630ac4f", + "36bcb9ca9bb5eac520b050409de25da5", "34d9a5dd3363668391bc3bd05b468182", + "1e149c28db8b234e43931c347a523794", "6e8aff02470f177c3ff4416db79fc508", + }; + static const char* const kDigests16x64[kNumIntraPredictors] = { + "727797ef15ccd8d325476fe8f12006a3", 
"f77c544ac8035e01920deae40cee7b07", + "12b0c69595328c465e0b25e0c9e3e9fc", "3b2a053ee8b05a8ac35ad23b0422a151", + "f3be77c0fe67eb5d9d515e92bec21eb7", "f1ece6409e01e9dd98b800d49628247d", + "efd2ec9bfbbd4fd1f6604ea369df1894", "ec703de918422b9e03197ba0ed60a199", + "739418efb89c07f700895deaa5d0b3e3", "9943ae1bbeeebfe1d3a92dc39e049d63", + }; + static const char* const kDigests32x8[kNumIntraPredictors] = { + "4da55401331ed98acec0c516d7307513", "0ae6f3974701a5e6c20baccd26b4ca52", + "79b799f1eb77d5189535dc4e18873a0e", "90e943adf3de4f913864dce4e52b4894", + "5e1b9cc800a89ef45f5bdcc9e99e4e96", "3103405df20d254cbf32ac30872ead4b", + "648550e369b77687bff3c7d6f249b02f", "f9f73bcd8aadfc059fa260325df957a1", + "204cef70d741c25d4fe2b1d10d2649a5", "04c05e18488496eba64100faa25e8baf", + }; + static const char* const kDigests32x16[kNumIntraPredictors] = { + "86ad1e1047abaf9959150222e8f19593", "1908cbe04eb4e5c9d35f1af7ffd7ee72", + "6ad3bb37ebe8374b0a4c2d18fe3ebb6a", "08d3cfe7a1148bff55eb6166da3378c6", + "656a722394764d17b6c42401b9e0ad3b", "4aa00c192102efeb325883737e562f0d", + "9881a90ca88bca4297073e60b3bb771a", "8cd74aada398a3d770fc3ace38ecd311", + "0a927e3f5ff8e8338984172cc0653b13", "d881d68b4eb3ee844e35e04ad6721f5f", + }; + static const char* const kDigests32x32[kNumIntraPredictors] = { + "1303ca680644e3d8c9ffd4185bb2835b", "2a4d9f5cc8da307d4cf7dc021df10ba9", + "ced60d3f4e4b011a6a0314dd8a4b1fd8", "ced60d3f4e4b011a6a0314dd8a4b1fd8", + "1464b01aa928e9bd82c66bad0f921693", "90deadfb13d7c3b855ba21b326c1e202", + "af96a74f8033dff010e53a8521bc6f63", "9f1039f2ef082aaee69fcb7d749037c2", + "3f82893e478e204f2d254b34222d14dc", "ddb2b95ffb65b84dd4ff1f7256223305", + }; + static const char* const kDigests32x64[kNumIntraPredictors] = { + "e1e8ed803236367821981500a3d9eebe", "0f46d124ba9f48cdd5d5290acf786d6d", + "4e2a2cfd8f56f15939bdfc753145b303", "0ce332b343934b34cd4417725faa85cb", + "1d2f8e48e3adb7c448be05d9f66f4954", "9fb2e176636a5689b26f73ca73fcc512", + "e720ebccae7e25e36f23da53ae5b5d6a", 
"86fe4364734169aaa4520d799890d530", + "b1870290764bb1b100d1974e2bd70f1d", "ce5b238e19d85ef69d85badfab4e63ae", + }; + static const char* const kDigests64x16[kNumIntraPredictors] = { + "de1b736e9d99129609d6ef3a491507a0", "516d8f6eb054d74d150e7b444185b6b9", + "69e462c3338a9aaf993c3f7cfbc15649", "821b76b1494d4f84d20817840f719a1a", + "fd9b4276e7affe1e0e4ce4f428058994", "cd82fd361a4767ac29a9f406b480b8f3", + "2792c2f810157a4a6cb13c28529ff779", "1220442d90c4255ba0969d28b91e93a6", + "c7253e10b45f7f67dfee3256c9b94825", "879792198071c7e0b50b9b5010d8c18f", + }; + static const char* const kDigests64x32[kNumIntraPredictors] = { + "e48e1ac15e97191a8fda08d62fff343e", "80c15b303235f9bc2259027bb92dfdc4", + "538424b24bd0830f21788e7238ca762f", "a6c5aeb722615089efbca80b02951ceb", + "12604b37875533665078405ef4582e35", "0048afa17bd3e1632d68b96048836530", + "07a0cfcb56a5eed50c4bd6c26814336b", "529d8a070de5bc6531fa3ee8f450c233", + "33c50a11c7d78f72434064f634305e95", "e0ef7f0559c1a50ec5a8c12011b962f7", + }; + static const char* const kDigests64x64[kNumIntraPredictors] = { + "a1650dbcd56e10288c3e269eca37967d", "be91585259bc37bf4dc1651936e90b3e", + "afe020786b83b793c2bbd9468097ff6e", "6e1094fa7b50bc813aa2ba29f5df8755", + "9e5c34f3797e0cdd3cd9d4c05b0d8950", "bc87be7ac899cc6a28f399d7516c49fe", + "9811fd0d2dd515f06122f5d1bd18b784", "3c140e466f2c2c0d9cb7d2157ab8dc27", + "9543de76c925a8f6adc884cc7f98dc91", "df1df0376cc944afe7e74e94f53e575a", + }; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4; + case kTransformSize4x8: + return kDigests4x8; + case kTransformSize4x16: + return kDigests4x16; + case kTransformSize8x4: + return kDigests8x4; + case kTransformSize8x8: + return kDigests8x8; + case kTransformSize8x16: + return kDigests8x16; + case kTransformSize8x32: + return kDigests8x32; + case kTransformSize16x4: + return kDigests16x4; + case kTransformSize16x8: + return kDigests16x8; + case kTransformSize16x16: + return kDigests16x16; + case kTransformSize16x32: + return 
kDigests16x32; + case kTransformSize16x64: + return kDigests16x64; + case kTransformSize32x8: + return kDigests32x8; + case kTransformSize32x16: + return kDigests32x16; + case kTransformSize32x32: + return kDigests32x32; + case kTransformSize32x64: + return kDigests32x64; + case kTransformSize64x16: + return kDigests64x16; + case kTransformSize64x32: + return kDigests64x32; + case kTransformSize64x64: + return kDigests64x64; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(IntraPredTest8bpp, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetIntraPredDigests8bpp(tx_size_), num_runs); +} + +TEST_P(IntraPredTest8bpp, FixedInput) { + TestSpeed(GetIntraPredDigests8bpp(tx_size_), 1); +} + +TEST_P(IntraPredTest8bpp, Overflow) { TestSaturatedValues(); } +TEST_P(IntraPredTest8bpp, Random) { TestRandomValues(); } + +//------------------------------------------------------------------------------ + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using IntraPredTest10bpp = IntraPredTest<10, uint16_t>; + +const char* const* GetIntraPredDigests10bpp(TransformSize tx_size) { + static const char* const kDigests4x4[kNumIntraPredictors] = { + "432bf9e762416bec582cb3654cbc4545", "8b9707ff4d506e0cb326f2d9a8d78705", + "a076275258cc5af87ed8b075136fb219", "f9587004012a8d2cecaa347331ccdf96", + "1c4e6890c5e6eed495fe54a6b6df8d6f", "0ae15fae8969a3c972ee895f325955a3", + "97db177738b831da8066df4f3fb7adbd", "4add5685b8a56991c9dce4ff7086ec25", + "75c6a655256188e378e70658b8f1631f", "14a27db20f9d5594ef74a7ea10c3e5ef", + }; + static const char* const kDigests4x8[kNumIntraPredictors] = { + "9cbd7c18aca2737fa41db27150798819", "13d1e734692e27339c10b07da33c1113", + "0617cf74e2dd5d34ea517af1767fa47e", "c6a7b01228ccdf74af8528ef8f5f55c6", + "13b05d87b3d566b2f7a4b332cd8a762e", "b26ae0e8da1fe8989dfe2900fa2c3847", + "c30f3acdd386bdac91028fe48b751810", "04d2baf5192c5af97ca18d3b9b0d5968", + 
"a0ef82983822fc815bf1e8326cd41e33", "20bf218bae5f6b5c6d56b85f3f9bbadb", + }; + static const char* const kDigests4x16[kNumIntraPredictors] = { + "d9b47bdddaa5e22312ff9ece7a3cae08", "cb76c79971b502dd8999a7047b3e2f86", + "3b09a3ff431d03b379acfdc444602540", "88608f6fcd687831e871053723cf76c3", + "a7bd2a17de1cf19c9a4b2c550f277a5c", "29b389f564f266a67687b8d2bc750418", + "4680847c30fe93c06f87e2ee1da544d6", "0e4eda11e1fe6ebe8526c2a2c5390bbb", + "bf3e20197282885acabb158f3a77ba59", "fccea71d1a253316b905f4a073c84a36", + }; + static const char* const kDigests8x4[kNumIntraPredictors] = { + "05ba0ed96aac48cd94e7597f12184320", "d97d04e791904d3cedc34d5430a4d1d2", + "49217081a169c2d30b0a43f816d0b58b", "09e2a6a6bfe35b83e9434ee9c8dcf417", + "4b03c8822169ee4fa058513d65f0e32f", "cabdeebc923837ee3f2d3480354d6a81", + "957eda610a23a011ed25976aee94eaf0", "4a197e3dfce1f0d3870138a9b66423aa", + "18c0d0fbe0e96a0baf2f98fa1908cbb9", "21114e5737328cdbba9940e4f85a0855", + }; + static const char* const kDigests8x8[kNumIntraPredictors] = { + "430e99eecda7e6434e1973dbdcc2a29d", "88864d7402c09b57735db49c58707304", + "8312f80b936380ceb51375e29a4fd75d", "472a7ed9c68bdbd9ecca197b7a8b3f01", + "4f66ee4dc0cb752c3b65d576cd06bb5c", "36383d6f61799143470129e2d5241a6f", + "c96279406c8d2d02771903e93a4e8d37", "4fb64f9700ed0bf08fbe7ab958535348", + "c008c33453ac9cf8c42ae6ec88f9941c", "39c401a9938b23e318ae7819e458daf1", + }; + static const char* const kDigests8x16[kNumIntraPredictors] = { + "bda6b75fedfe0705f9732ff84c918672", "4ff130a47429e0762386557018ec10b2", + "8156557bf938d8e3a266318e57048fc5", "bdfa8e01a825ec7ae2d80519e3c94eec", + "108fc8e5608fe09f9cc30d7a52cbc0c1", "a2271660af5424b64c6399ca5509dee1", + "b09af9729f39516b28ff62363f8c0cb2", "4fe67869dac99048dfcf4d4e621884ec", + "311f498369a9c98f77a961bf91e73e65", "d66e78b9f41d5ee6a4b25e37ec9af324", + }; + static const char* const kDigests8x32[kNumIntraPredictors] = { + "26c45325f02521e7e5c66c0aa0819329", "79dfb68513d4ccd2530c485f0367858e", + 
"8288e99b4d738b13956882c3ad3f03fe", "7c4993518b1620b8be8872581bb72239", + "2b1c3126012d981f787ed0a2601ee377", "051ba9f0c4d4fecb1fcd81fdea94cae4", + "320362239ad402087303a4df39512bb1", "210df35b2055c9c01b9e3e5ae24e524b", + "f8536db74ce68c0081bbd8799dac25f9", "27f2fe316854282579906d071af6b705", + }; + static const char* const kDigests16x4[kNumIntraPredictors] = { + "decff67721ff7e9e65ec641e78f5ccf3", "99e3b2fbdabfa9b76b749cfb6530a9fd", + "accdb3d25629916963a069f1e1c0e061", "ad42855e9146748b0e235b8428487b4b", + "53025e465f267e7af2896ebd028447a0", "577d26fcd2d655cc77a1f1f875648699", + "7a61a3619267221b448b20723840e9f0", "fb4ccc569bdae3614e87bc5be1e84284", + "b866095d8a3e6910cc4f92f8d8d6075a", "6ba9013cba1624872bfbac111e8d344a", + }; + static const char* const kDigests16x8[kNumIntraPredictors] = { + "2832156bd076c75f8be5622f34cb3efe", "da70e516f5a8842dd4965b80cd8d2a76", + "c3e137c6d79c57be2073d1eda22c8d1e", "8c5d28c7b3301b50326582dd7f89a175", + "9d8558775155b201cd178ab61458b642", "ecbddb9c6808e0c609c8fe537b7f7408", + "29a123c22cb4020170f9a80edf1208da", "653d0cd0688aa682334156f7b4599b34", + "1bfa66ae92a22a0346511db1713fe7df", "1802ad1e657e7fc08fc063342f471ca1", + }; + static const char* const kDigests16x16[kNumIntraPredictors] = { + "2270c626de9d49769660ae9184a6428f", "9f069625cdcdd856e2e7ec19ff4fcd50", + "34167b9c413362a377aa7b1faf92ae6d", "3cec2b23d179765daea8dfb87c9efdd5", + "daa8f0863a5df2aef2b20999961cc8f8", "d9e4dd4bc63991e4f09cb97eb25f4db4", + "4e1a182fc3fcf5b9f5a73898f81c2004", "c58e4275406c9fd1c2a74b40c27afff0", + "b8092796fd4e4dd9d2b92afb770129ba", "75424d1f18ff00c4093743d033c6c9b6", + }; + static const char* const kDigests16x32[kNumIntraPredictors] = { + "5aa050947f3d488537f5a68c23bb135b", "9e66143a2c3863b6fe171275a192d378", + "86b0c4777625e84d52913073d234f860", "9e2144fcf2107c76cec4241416bbecd5", + "c72be592efc72c3c86f2359b6f622aba", "c4e0e735545f78f43e21e9c39eab7b8f", + "52122e7c84a4bab67a8a359efb427023", "7b5fd8bb7e0744e81fd6fa4ed4c2e0fb", + 
"a9950d110bffb0411a8fcd1262dceef0", "2a2dd496f01f5d87f257ed202a703cbe", + }; + static const char* const kDigests16x64[kNumIntraPredictors] = { + "eeb1b873e81ca428b11f162bd5b28843", "39ce7d22791f82562b0ca1e0afdf1604", + "6bd6bdac8982a4b84613f9963d35d5e9", "a9ac2438e87522621c7e6fe6d02c01ab", + "a8b9c471fe6c66ed0717e77fea77bba1", "e050b6aa38aee6e951d3be5a94a8abd0", + "3c5ecc31aa45e8175d37e90af247bca6", "30c0f9e412ea726970f575f910edfb94", + "f3d96395816ce58fb98480a5b4c32ab2", "9c14811957e013fb009dcd4a3716b338", + }; + static const char* const kDigests32x8[kNumIntraPredictors] = { + "d6560d7fc9ae9bd7c25e2983b4a825e3", "90a67154bbdc26cd06ab0fa25fff3c53", + "c42d37c5a634e68fafc982626842db0b", "ecc8646d258cfa431facbc0dba168f80", + "9f3c167b790b52242dc8686c68eac389", "62dc3bc34406636ccec0941579461f65", + "5c0f0ebdb3c936d4decc40d5261aec7c", "dbfc0f056ca25e0331042da6d292e10a", + "14fa525d74e6774781198418d505c595", "5f95e70db03da9ed70cd79e23f19199c", + }; + static const char* const kDigests32x16[kNumIntraPredictors] = { + "dfe3630aa9eeb1adcc8604269a309f26", "ba6180227d09f5a573f69dc6ee1faf80", + "03edea9d71ca3d588e1a0a69aecdf555", "2c8805415f44b4fac6692090dc1b1ddd", + "18efd17ed72a6e92ef8b0a692cf7a2e3", "63a6e0abfb839b43c68c23b2c43c8918", + "be15479205bb60f5a17baaa81a6b47ad", "243d21e1d9f9dd2b981292ac7769315a", + "21de1cb5269e0e1d08930c519e676bf7", "73065b3e27e9c4a3a6d043712d3d8b25", + }; + static const char* const kDigests32x32[kNumIntraPredictors] = { + "c3136bb829088e33401b1affef91f692", "68bbcf93d17366db38bbc7605e07e322", + "2786be5fb7c25eeec4d2596c4154c3eb", "25ac7468e691753b8291be859aac7493", + "a6805ce21bfd26760e749efc8f590fa3", "5a38fd324b466e8ac43f5e289d38107e", + "dd0628fc5cc920b82aa941378fa907c8", "8debadbdb2dec3dc7eb43927e9d36998", + "61e1bc223c9e04c64152cc4531b6c099", "900b00ac1f20c0a8d22f8b026c0ee1cc", + }; + static const char* const kDigests32x64[kNumIntraPredictors] = { + "5a591b2b83f0a6cce3c57ce164a5f983", "f42167ec516102b83b2c5176df57316b", + 
"58f3772d3df511c8289b340beb178d96", "c24166e7dc252d34ac6f92712956d751", + "7dca3acfe2ea09e6292a9ece2078b827", "5c029235fc0820804e40187d2b22a96e", + "375572944368afbc04ca97dab7fb3328", "8867235908736fd99c4022e4ed604e6e", + "63ec336034d62846b75558c49082870f", "46f35d85eb8499d61bfeac1c49e52531", + }; + static const char* const kDigests64x16[kNumIntraPredictors] = { + "67755882209304659a0e6bfc324e16b9", "cd89b272fecb5f23431b3f606f590722", + "9bcff7d971a4af0a2d1cac6d66d83482", "d8d6bb55ebeec4f03926908d391e15ba", + "0eb5b5ced3e7177a1dd6a1e72e7a7d21", "92b47fe431d9cf66f9e601854f0f3017", + "7dc599557eddb2ea480f86fc89c76b30", "4f40175676c164320fe8005440ad9217", + "b00eacb24081a041127f136e9e5983ec", "cb0ab76a5e90f2eb75c38b99b9833ff8", + }; + static const char* const kDigests64x32[kNumIntraPredictors] = { + "21d873011d1b4ef1daedd9aa8c6938ea", "4866da21db0261f738903d97081cb785", + "a722112233a82595a8d001a4078b834d", "24c7a133c6fcb59129c3782ef908a6c1", + "490e40505dd255d3a909d8a72c280cbc", "2afe719fb30bf2a664829bb74c8f9e2a", + "623adad2ebb8f23e355cd77ace4616cd", "d6092541e9262ad009bef79a5d350a86", + "ae86d8fba088683ced8abfd7e1ddf380", "32aa8aa21f2f24333d31f99e12b95c53", + }; + static const char* const kDigests64x64[kNumIntraPredictors] = { + "6d88aeb40dfe3ac43c68808ca3c00806", "6a75d88ac291d6a3aaf0eec0ddf2aa65", + "30ef52d7dc451affdd587c209f5cb2dd", "e073f7969f392258eaa907cf0636452a", + "de10f07016a2343bcd3a9deb29f4361e", "dc35ff273fea4355d2c8351c2ed14e6e", + "01b9a545968ac75c3639ddabb837fa0b", "85c98ed9c0ea1523a15281bc9a909b8c", + "4c255f7ef7fd46db83f323806d79dca4", "fe2fe6ffb19cb8330e2f2534271d6522", + }; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4; + case kTransformSize4x8: + return kDigests4x8; + case kTransformSize4x16: + return kDigests4x16; + case kTransformSize8x4: + return kDigests8x4; + case kTransformSize8x8: + return kDigests8x8; + case kTransformSize8x16: + return kDigests8x16; + case kTransformSize8x32: + return kDigests8x32; + case 
kTransformSize16x4: + return kDigests16x4; + case kTransformSize16x8: + return kDigests16x8; + case kTransformSize16x16: + return kDigests16x16; + case kTransformSize16x32: + return kDigests16x32; + case kTransformSize16x64: + return kDigests16x64; + case kTransformSize32x8: + return kDigests32x8; + case kTransformSize32x16: + return kDigests32x16; + case kTransformSize32x32: + return kDigests32x32; + case kTransformSize32x64: + return kDigests32x64; + case kTransformSize64x16: + return kDigests64x16; + case kTransformSize64x32: + return kDigests64x32; + case kTransformSize64x64: + return kDigests64x64; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(IntraPredTest10bpp, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetIntraPredDigests10bpp(tx_size_), num_runs); +} + +TEST_P(IntraPredTest10bpp, FixedInput) { + TestSpeed(GetIntraPredDigests10bpp(tx_size_), 1); +} + +TEST_P(IntraPredTest10bpp, Overflow) { TestSaturatedValues(); } +TEST_P(IntraPredTest10bpp, Random) { TestRandomValues(); } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +constexpr TransformSize kTransformSizes[] = { + kTransformSize4x4, kTransformSize4x8, kTransformSize4x16, + kTransformSize8x4, kTransformSize8x8, kTransformSize8x16, + kTransformSize8x32, kTransformSize16x4, kTransformSize16x8, + kTransformSize16x16, kTransformSize16x32, kTransformSize16x64, + kTransformSize32x8, kTransformSize32x16, kTransformSize32x32, + kTransformSize32x64, kTransformSize64x16, kTransformSize64x32, + kTransformSize64x64}; + +INSTANTIATE_TEST_SUITE_P(C, IntraPredTest8bpp, + testing::ValuesIn(kTransformSizes)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, IntraPredTest8bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_SSE4_1 +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, IntraPredTest8bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_NEON + 
+#if LIBGAV1_MAX_BITDEPTH >= 10 +INSTANTIATE_TEST_SUITE_P(C, IntraPredTest10bpp, + testing::ValuesIn(kTransformSizes)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, IntraPredTest10bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_SSE4_1 +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, IntraPredTest10bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp + +static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) { + return os << ToString(tx_size); +} + +} // namespace libgav1 diff --git a/src/dsp/inverse_transform.cc b/src/dsp/inverse_transform.cc index a03fad2..ed984d8 100644 --- a/src/dsp/inverse_transform.cc +++ b/src/dsp/inverse_transform.cc @@ -1184,9 +1184,10 @@ void TransformLoop_C(TransformType tx_type, TransformSize tx_size, Residual tx_buffer[64]; for (int j = 0; j < tx_width; ++j) { const int flipped_j = flip_columns ? tx_width - j - 1 : j; - for (int i = 0; i < tx_height; ++i) { + int i = 0; + do { tx_buffer[i] = residual[i][flipped_j]; - } + } while (++i != tx_height); if (adjusted_tx_height == 1) { dconly_transform1d(tx_buffer, column_clamp_range, false, 0, false); } else { @@ -1211,6 +1212,7 @@ void TransformLoop_C(TransformType tx_type, TransformSize tx_size, //------------------------------------------------------------------------------ +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS template <int bitdepth, typename Residual, typename Pixel> void InitAll(Dsp* const dsp) { // Maximum transform size for Dct is 64. 
@@ -1325,6 +1327,7 @@ void InitAll(Dsp* const dsp) { Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>, /*is_row=*/false>; } +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(8); diff --git a/src/dsp/inverse_transform_test.cc b/src/dsp/inverse_transform_test.cc new file mode 100644 index 0000000..623e203 --- /dev/null +++ b/src/dsp/inverse_transform_test.cc @@ -0,0 +1,536 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/inverse_transform.h" + +#include <algorithm> +#include <cstdint> +#include <cstdio> +#include <cstring> +#include <ostream> + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/array_2d.h" +#include "src/utils/bit_mask_set.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kMaxBlockSize = 64; +constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize; + +const char* const kTransformSize1DNames[kNum1DTransformSizes] = { + "k1DTransformSize4", "k1DTransformSize8", "k1DTransformSize16", + "k1DTransformSize32", "k1DTransformSize64"}; + +constexpr TransformSize1D kRow1DTransformSizes[] = { + k1DTransformSize4, k1DTransformSize4, k1DTransformSize4, + k1DTransformSize8, k1DTransformSize8, k1DTransformSize8, + k1DTransformSize8, k1DTransformSize16, k1DTransformSize16, + k1DTransformSize16, k1DTransformSize16, k1DTransformSize16, + k1DTransformSize32, k1DTransformSize32, k1DTransformSize32, + k1DTransformSize32, k1DTransformSize64, k1DTransformSize64, + k1DTransformSize64}; + +constexpr TransformSize1D kCol1DTransformSizes[] = { + k1DTransformSize4, k1DTransformSize8, k1DTransformSize16, + k1DTransformSize4, k1DTransformSize8, k1DTransformSize16, + k1DTransformSize32, k1DTransformSize4, k1DTransformSize8, + k1DTransformSize16, k1DTransformSize32, k1DTransformSize64, + k1DTransformSize8, k1DTransformSize16, k1DTransformSize32, + k1DTransformSize64, k1DTransformSize16, k1DTransformSize32, + k1DTransformSize64}; + +template <int bitdepth, typename SrcPixel, typename DstPixel> +class InverseTransformTestBase : public testing::TestWithParam<TransformSize>, + public 
test_utils::MaxAlignedAllocable { + public: + InverseTransformTestBase() { + switch (tx_size_) { + case kNumTransformSizes: + EXPECT_NE(tx_size_, kNumTransformSizes); + break; + default: + block_width_ = kTransformWidth[tx_size_]; + block_height_ = kTransformHeight[tx_size_]; + break; + } + } + + InverseTransformTestBase(const InverseTransformTestBase&) = delete; + InverseTransformTestBase& operator=(const InverseTransformTestBase&) = delete; + ~InverseTransformTestBase() override = default; + + protected: + struct InverseTransformMem { + void Reset(libvpx_test::ACMRandom* rnd, int width, int height) { + ASSERT_NE(rnd, nullptr); + // Limit the size of the residual values to bitdepth + sign in order + // to prevent outranging in the transforms. + const int num_bits = bitdepth + 1; + const int sign_shift = (bitdepth == 8 ? 16 : 32) - num_bits; + const int mask = (1 << num_bits) - 1; + // Fill residual with random data. For widths == 64, only fill the upper + // left 32 x min(block_height_, 32). + memset(ref_src, 0, sizeof(ref_src)); + SrcPixel* r = ref_src; + const int stride = width; + for (int y = 0; y < std::min(height, 32); ++y) { + for (int x = 0; x < std::min(width, 32); ++x) { + r[x] = rnd->Rand16() & mask; + // The msb of num_bits is the sign bit, so force each 16 bit value to + // the correct sign. + r[x] = (r[x] << sign_shift) >> sign_shift; + } + r += stride; + } + + // Set frame data to random values. + for (int y = 0; y < kMaxBlockSize; ++y) { + for (int x = 0; x < kMaxBlockSize; ++x) { + const int mask = (1 << bitdepth) - 1; + cur_frame[y * kMaxBlockSize + x] = base_frame[y * kMaxBlockSize + x] = + rnd->Rand16() & mask; + } + } + } + + // Set ref_src to |pixel|. 
+ void Set(const SrcPixel pixel) { + for (auto& r : ref_src) r = pixel; + } + + alignas(kMaxAlignment) DstPixel base_frame[kTotalPixels]; + alignas(kMaxAlignment) DstPixel cur_frame[kTotalPixels]; + + alignas(kMaxAlignment) SrcPixel base_residual[kTotalPixels]; + alignas(kMaxAlignment) SrcPixel cur_residual[kTotalPixels]; + + alignas(kMaxAlignment) SrcPixel ref_src[kTotalPixels]; + }; + + void SetUp() override { test_utils::ResetDspTable(bitdepth); } + + const TransformSize tx_size_ = GetParam(); + int block_width_; + int block_height_; + InverseTransformMem inverse_transform_mem_; +}; + +//------------------------------------------------------------------------------ +// InverseTransformTest + +template <int bitdepth, typename Pixel, typename DstPixel> +class InverseTransformTest + : public InverseTransformTestBase<bitdepth, Pixel, DstPixel> { + public: + InverseTransformTest() = default; + InverseTransformTest(const InverseTransformTest&) = delete; + InverseTransformTest& operator=(const InverseTransformTest&) = delete; + ~InverseTransformTest() override = default; + + protected: + using InverseTransformTestBase<bitdepth, Pixel, DstPixel>::tx_size_; + using InverseTransformTestBase<bitdepth, Pixel, DstPixel>::block_width_; + using InverseTransformTestBase<bitdepth, Pixel, DstPixel>::block_height_; + using InverseTransformTestBase<bitdepth, Pixel, + DstPixel>::inverse_transform_mem_; + + void SetUp() override { + InverseTransformTestBase<bitdepth, Pixel, DstPixel>::SetUp(); + InverseTransformInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + + tx_size_1d_row_ = kRow1DTransformSizes[tx_size_]; + tx_size_1d_column_ = kCol1DTransformSizes[tx_size_]; + + memcpy(base_inverse_transforms_, dsp->inverse_transforms, + sizeof(base_inverse_transforms_)); + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if 
(absl::StartsWith(test_case, "C/")) { + memset(base_inverse_transforms_, 0, sizeof(base_inverse_transforms_)); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + InverseTransformInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + InverseTransformInit_NEON(); + InverseTransformInit10bpp_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + + memcpy(cur_inverse_transforms_, dsp->inverse_transforms, + sizeof(cur_inverse_transforms_)); + + for (int i = 0; i < kNum1DTransforms; ++i) { + // skip functions that haven't been specialized for this particular + // architecture. + if (cur_inverse_transforms_[i][tx_size_1d_row_][kRow] == + base_inverse_transforms_[i][tx_size_1d_row_][kRow]) { + cur_inverse_transforms_[i][tx_size_1d_row_][kRow] = nullptr; + } + if (cur_inverse_transforms_[i][tx_size_1d_column_][kColumn] == + base_inverse_transforms_[i][tx_size_1d_column_][kColumn]) { + cur_inverse_transforms_[i][tx_size_1d_column_][kColumn] = nullptr; + } + } + + base_frame_buffer_.Reset(kMaxBlockSize, kMaxBlockSize, + inverse_transform_mem_.base_frame); + + cur_frame_buffer_.Reset(kMaxBlockSize, kMaxBlockSize, + inverse_transform_mem_.cur_frame); + } + + // These tests modify inverse_transform_mem_. 
+ void TestRandomValues(int num_tests); + void TestDcOnlyRandomValue(int num_tests); + + Array2DView<DstPixel> base_frame_buffer_; + Array2DView<DstPixel> cur_frame_buffer_; + + TransformSize1D tx_size_1d_row_ = k1DTransformSize4; + TransformSize1D tx_size_1d_column_ = k1DTransformSize4; + + InverseTransformAddFuncs base_inverse_transforms_; + InverseTransformAddFuncs cur_inverse_transforms_; +}; + +constexpr TransformType kLibgav1TxType[kNumTransformTypes] = { + kTransformTypeDctDct, kTransformTypeAdstDct, + kTransformTypeDctAdst, kTransformTypeAdstAdst, + kTransformTypeFlipadstDct, kTransformTypeDctFlipadst, + kTransformTypeFlipadstFlipadst, kTransformTypeAdstFlipadst, + kTransformTypeFlipadstAdst, kTransformTypeIdentityIdentity, + kTransformTypeIdentityDct, kTransformTypeDctIdentity, + kTransformTypeIdentityAdst, kTransformTypeAdstIdentity, + kTransformTypeIdentityFlipadst, kTransformTypeFlipadstIdentity}; + +// Maps TransformType to dsp::Transform1D for the row transforms. +constexpr Transform1D kRowTransform[kNumTransformTypes] = { + k1DTransformDct, k1DTransformAdst, k1DTransformDct, + k1DTransformAdst, k1DTransformAdst, k1DTransformDct, + k1DTransformAdst, k1DTransformAdst, k1DTransformAdst, + k1DTransformIdentity, k1DTransformIdentity, k1DTransformDct, + k1DTransformIdentity, k1DTransformAdst, k1DTransformIdentity, + k1DTransformAdst}; + +// Maps TransformType to dsp::Transform1D for the column transforms. +constexpr Transform1D kColumnTransform[kNumTransformTypes] = { + k1DTransformDct, k1DTransformDct, k1DTransformAdst, + k1DTransformAdst, k1DTransformDct, k1DTransformAdst, + k1DTransformAdst, k1DTransformAdst, k1DTransformAdst, + k1DTransformIdentity, k1DTransformDct, k1DTransformIdentity, + k1DTransformAdst, k1DTransformIdentity, k1DTransformAdst, + k1DTransformIdentity}; + +// Mask indicating whether the transform sets contain a particular transform +// type. If |tx_type| is present in |tx_set|, then the |tx_type|th LSB is set. 
+constexpr BitMaskSet kTransformTypeInSetMask[kNumTransformSets] = { + BitMaskSet(0x1), BitMaskSet(0xE0F), BitMaskSet(0x20F), + BitMaskSet(0xFFFF), BitMaskSet(0xFFF), BitMaskSet(0x201)}; + +bool IsTxSizeTypeValid(TransformSize tx_size, TransformType tx_type) { + const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size]; + TransformSet tx_set; + if (tx_size_square_max > kTransformSize32x32) { + tx_set = kTransformSetDctOnly; + } else if (tx_size_square_max == kTransformSize32x32) { + tx_set = kTransformSetInter3; + } else if (tx_size_square_max == kTransformSize16x16) { + tx_set = kTransformSetInter2; + } else { + tx_set = kTransformSetInter1; + } + return kTransformTypeInSetMask[tx_set].Contains(tx_type); +} + +template <int bitdepth, typename Pixel, typename DstPixel> +void InverseTransformTest<bitdepth, Pixel, DstPixel>::TestRandomValues( + int num_tests) { + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + + for (int tx_type_idx = 0; tx_type_idx < kNumTransformTypes; ++tx_type_idx) { + const TransformType tx_type = kLibgav1TxType[tx_type_idx]; + const Transform1D row_transform = kRowTransform[tx_type]; + const Transform1D column_transform = kColumnTransform[tx_type]; + + // Skip the 'C' test case as this is used as the reference. + if (base_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] == + nullptr || + cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] == + nullptr || + base_inverse_transforms_[column_transform][tx_size_1d_column_] + [kColumn] == nullptr || + cur_inverse_transforms_[column_transform][tx_size_1d_column_] + [kColumn] == nullptr) { + continue; + } + + // Only test valid tx_size for given tx_type. See 5.11.40. 
+ if (!IsTxSizeTypeValid(tx_size_, tx_type)) continue; + + absl::Duration base_elapsed_time[2]; + absl::Duration cur_elapsed_time[2]; + + for (int n = 0; n < num_tests; ++n) { + const int tx_height = std::min(block_height_, 32); + const int start_x = 0; + const int start_y = 0; + + inverse_transform_mem_.Reset(&rnd, block_width_, block_height_); + memcpy(inverse_transform_mem_.base_residual, + inverse_transform_mem_.ref_src, + sizeof(inverse_transform_mem_.ref_src)); + memcpy(inverse_transform_mem_.cur_residual, + inverse_transform_mem_.ref_src, + sizeof(inverse_transform_mem_.ref_src)); + + const absl::Time base_row_start = absl::Now(); + base_inverse_transforms_[row_transform][tx_size_1d_row_][kRow]( + tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual, + start_x, start_y, &base_frame_buffer_); + base_elapsed_time[kRow] += absl::Now() - base_row_start; + + const absl::Time cur_row_start = absl::Now(); + cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow]( + tx_type, tx_size_, tx_height, inverse_transform_mem_.cur_residual, + start_x, start_y, &cur_frame_buffer_); + cur_elapsed_time[kRow] += absl::Now() - cur_row_start; + + const absl::Time base_column_start = absl::Now(); + base_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn]( + tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual, + start_x, start_y, &base_frame_buffer_); + base_elapsed_time[kColumn] += absl::Now() - base_column_start; + + const absl::Time cur_column_start = absl::Now(); + cur_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn]( + tx_type, tx_size_, tx_height, inverse_transform_mem_.cur_residual, + start_x, start_y, &cur_frame_buffer_); + cur_elapsed_time[kColumn] += absl::Now() - cur_column_start; + + if (!test_utils::CompareBlocks(inverse_transform_mem_.base_frame, + inverse_transform_mem_.cur_frame, + block_width_, block_height_, kMaxBlockSize, + kMaxBlockSize, false)) { + ADD_FAILURE() << "Result from optimized version 
of " + << ToString( + static_cast<TransformSize1D>(tx_size_1d_column_)) + << " differs from reference in iteration #" << n + << "tx_type_idx:" << tx_type_idx; + break; + } + } + + if (num_tests > 1) { + const auto base_row_elapsed_time_us = + static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time[kRow])); + const auto cur_row_elapsed_time_us = + static_cast<int>(absl::ToInt64Microseconds(cur_elapsed_time[kRow])); + printf("TxType %30s[%19s]:: base_row: %5d us cur_row: %5d us %2.2fx \n", + ToString(tx_type), kTransformSize1DNames[tx_size_1d_row_], + base_row_elapsed_time_us, cur_row_elapsed_time_us, + static_cast<float>(base_row_elapsed_time_us) / + static_cast<float>(cur_row_elapsed_time_us)); + const auto base_column_elapsed_time_us = static_cast<int>( + absl::ToInt64Microseconds(base_elapsed_time[kColumn])); + const auto cur_column_elapsed_time_us = static_cast<int>( + absl::ToInt64Microseconds(cur_elapsed_time[kColumn])); + printf("TxType %30s[%19s]:: base_col: %5d us cur_col: %5d us %2.2fx \n", + ToString(tx_type), kTransformSize1DNames[tx_size_1d_column_], + base_column_elapsed_time_us, cur_column_elapsed_time_us, + static_cast<float>(base_column_elapsed_time_us) / + static_cast<float>(cur_column_elapsed_time_us)); + } + } +} + +template <int bitdepth, typename Pixel, typename DstPixel> +void InverseTransformTest<bitdepth, Pixel, DstPixel>::TestDcOnlyRandomValue( + int num_tests) { + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + + for (int tx_type_idx = 0; tx_type_idx < kNumTransformTypes; ++tx_type_idx) { + const TransformType tx_type = kLibgav1TxType[tx_type_idx]; + const Transform1D row_transform = kRowTransform[tx_type]; + const Transform1D column_transform = kColumnTransform[tx_type]; + + if (cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] == + nullptr || + cur_inverse_transforms_[column_transform][tx_size_1d_column_] + [kColumn] == nullptr) { + continue; + } + + // Only test valid tx_size for given 
tx_type. See 5.11.40. + if (IsTxSizeTypeValid(tx_size_, tx_type) == 0) continue; + + absl::Duration base_elapsed_time[2]; + absl::Duration cur_elapsed_time[2]; + + for (int n = 0; n < num_tests; ++n) { + const int tx_height = std::min(block_height_, 32); + const int start_x = 0; + const int start_y = 0; + + // Using width == 1 and height == 1 will reset only the dc value. + inverse_transform_mem_.Reset(&rnd, 1, 1); + memcpy(inverse_transform_mem_.base_residual, + inverse_transform_mem_.ref_src, + sizeof(inverse_transform_mem_.ref_src)); + memcpy(inverse_transform_mem_.cur_residual, + inverse_transform_mem_.ref_src, + sizeof(inverse_transform_mem_.ref_src)); + + // For this test, the "base" contains the output when the + // tx_height is set to the max for the given block size. The + // "cur" contains the output when the passed in tx_height is 1. + // Compare the outputs for match. + const absl::Time base_row_start = absl::Now(); + cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow]( + tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual, + start_x, start_y, &base_frame_buffer_); + base_elapsed_time[kRow] += absl::Now() - base_row_start; + + const absl::Time cur_row_start = absl::Now(); + cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow]( + tx_type, tx_size_, /*adjusted_tx_height=*/1, + inverse_transform_mem_.cur_residual, start_x, start_y, + &cur_frame_buffer_); + cur_elapsed_time[kRow] += absl::Now() - cur_row_start; + + const absl::Time base_column_start = absl::Now(); + cur_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn]( + tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual, + start_x, start_y, &base_frame_buffer_); + base_elapsed_time[kColumn] += absl::Now() - base_column_start; + + const absl::Time cur_column_start = absl::Now(); + cur_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn]( + tx_type, tx_size_, /*adjusted_tx_height=*/1, + inverse_transform_mem_.cur_residual, 
start_x, start_y, + &cur_frame_buffer_); + cur_elapsed_time[kColumn] += absl::Now() - cur_column_start; + + if (!test_utils::CompareBlocks(inverse_transform_mem_.base_frame, + inverse_transform_mem_.cur_frame, + block_width_, block_height_, kMaxBlockSize, + kMaxBlockSize, false)) { + ADD_FAILURE() << "Result from dc only version of " + << ToString( + static_cast<TransformSize1D>(tx_size_1d_column_)) + << " differs from reference in iteration #" << n + << "tx_type_idx:" << tx_type_idx; + break; + } + } + + if (num_tests > 1) { + const auto base_row_elapsed_time_us = + static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time[kRow])); + const auto cur_row_elapsed_time_us = + static_cast<int>(absl::ToInt64Microseconds(cur_elapsed_time[kRow])); + printf("TxType %30s[%19s]:: base_row: %5d us cur_row: %5d us %2.2fx \n", + ToString(tx_type), kTransformSize1DNames[tx_size_1d_row_], + base_row_elapsed_time_us, cur_row_elapsed_time_us, + static_cast<float>(base_row_elapsed_time_us) / + static_cast<float>(cur_row_elapsed_time_us)); + const auto base_column_elapsed_time_us = static_cast<int>( + absl::ToInt64Microseconds(base_elapsed_time[kColumn])); + const auto cur_column_elapsed_time_us = static_cast<int>( + absl::ToInt64Microseconds(cur_elapsed_time[kColumn])); + printf("TxType %30s[%19s]:: base_col: %5d us cur_col: %5d us %2.2fx \n", + ToString(tx_type), kTransformSize1DNames[tx_size_1d_column_], + base_column_elapsed_time_us, cur_column_elapsed_time_us, + static_cast<float>(base_column_elapsed_time_us) / + static_cast<float>(cur_column_elapsed_time_us)); + } + } +} + +using InverseTransformTest8bpp = InverseTransformTest<8, int16_t, uint8_t>; + +TEST_P(InverseTransformTest8bpp, Random) { TestRandomValues(1); } + +TEST_P(InverseTransformTest8bpp, DISABLED_Speed) { TestRandomValues(10000); } + +TEST_P(InverseTransformTest8bpp, DcRandom) { TestDcOnlyRandomValue(1); } + +constexpr TransformSize kTransformSizesAll[] = { + kTransformSize4x4, kTransformSize4x8, 
kTransformSize4x16, + kTransformSize8x4, kTransformSize8x8, kTransformSize8x16, + kTransformSize8x32, kTransformSize16x4, kTransformSize16x8, + kTransformSize16x16, kTransformSize16x32, kTransformSize16x64, + kTransformSize32x8, kTransformSize32x16, kTransformSize32x32, + kTransformSize32x64, kTransformSize64x16, kTransformSize64x32, + kTransformSize64x64}; + +INSTANTIATE_TEST_SUITE_P(C, InverseTransformTest8bpp, + testing::ValuesIn(kTransformSizesAll)); +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, InverseTransformTest8bpp, + testing::ValuesIn(kTransformSizesAll)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, InverseTransformTest8bpp, + testing::ValuesIn(kTransformSizesAll)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using InverseTransformTest10bpp = InverseTransformTest<10, int32_t, uint16_t>; + +TEST_P(InverseTransformTest10bpp, Random) { TestRandomValues(1); } + +TEST_P(InverseTransformTest10bpp, DISABLED_Speed) { TestRandomValues(10000); } + +TEST_P(InverseTransformTest10bpp, DcRandom) { TestDcOnlyRandomValue(1); } + +INSTANTIATE_TEST_SUITE_P(C, InverseTransformTest10bpp, + testing::ValuesIn(kTransformSizesAll)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, InverseTransformTest10bpp, + testing::ValuesIn(kTransformSizesAll)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp + +static std::ostream& operator<<(std::ostream& os, const TransformSize param) { + return os << ToString(param); +} + +} // namespace libgav1 diff --git a/src/dsp/libgav1_dsp.cmake b/src/dsp/libgav1_dsp.cmake index 960d5a7..a28334d 100644 --- a/src/dsp/libgav1_dsp.cmake +++ b/src/dsp/libgav1_dsp.cmake @@ -40,8 +40,16 @@ list(APPEND libgav1_dsp_sources "${libgav1_source}/dsp/film_grain_common.h" "${libgav1_source}/dsp/intra_edge.cc" "${libgav1_source}/dsp/intra_edge.h" + "${libgav1_source}/dsp/intrapred_cfl.cc" + "${libgav1_source}/dsp/intrapred_cfl.h" + "${libgav1_source}/dsp/intrapred_directional.cc" + 
"${libgav1_source}/dsp/intrapred_directional.h" + "${libgav1_source}/dsp/intrapred_filter.cc" + "${libgav1_source}/dsp/intrapred_filter.h" "${libgav1_source}/dsp/intrapred.cc" "${libgav1_source}/dsp/intrapred.h" + "${libgav1_source}/dsp/intrapred_smooth.cc" + "${libgav1_source}/dsp/intrapred_smooth.h" "${libgav1_source}/dsp/inverse_transform.cc" "${libgav1_source}/dsp/inverse_transform.h" "${libgav1_source}/dsp/inverse_transform.inc" @@ -67,6 +75,8 @@ list(APPEND libgav1_dsp_sources list(APPEND libgav1_dsp_sources_avx2 ${libgav1_dsp_sources_avx2} + "${libgav1_source}/dsp/x86/cdef_avx2.cc" + "${libgav1_source}/dsp/x86/cdef_avx2.h" "${libgav1_source}/dsp/x86/convolve_avx2.cc" "${libgav1_source}/dsp/x86/convolve_avx2.h" "${libgav1_source}/dsp/x86/loop_restoration_10bit_avx2.cc" @@ -89,11 +99,16 @@ list(APPEND libgav1_dsp_sources_neon "${libgav1_source}/dsp/arm/intra_edge_neon.cc" "${libgav1_source}/dsp/arm/intra_edge_neon.h" "${libgav1_source}/dsp/arm/intrapred_cfl_neon.cc" + "${libgav1_source}/dsp/arm/intrapred_cfl_neon.h" + "${libgav1_source}/dsp/arm/intrapred_directional_neon.h" "${libgav1_source}/dsp/arm/intrapred_directional_neon.cc" - "${libgav1_source}/dsp/arm/intrapred_filter_intra_neon.cc" + "${libgav1_source}/dsp/arm/intrapred_filter_neon.cc" + "${libgav1_source}/dsp/arm/intrapred_filter_neon.h" "${libgav1_source}/dsp/arm/intrapred_neon.cc" "${libgav1_source}/dsp/arm/intrapred_neon.h" "${libgav1_source}/dsp/arm/intrapred_smooth_neon.cc" + "${libgav1_source}/dsp/arm/intrapred_smooth_neon.h" + "${libgav1_source}/dsp/arm/inverse_transform_10bit_neon.cc" "${libgav1_source}/dsp/arm/inverse_transform_neon.cc" "${libgav1_source}/dsp/arm/inverse_transform_neon.h" "${libgav1_source}/dsp/arm/loop_filter_neon.cc" @@ -124,14 +139,23 @@ list(APPEND libgav1_dsp_sources_sse4 "${libgav1_source}/dsp/x86/cdef_sse4.h" "${libgav1_source}/dsp/x86/convolve_sse4.cc" "${libgav1_source}/dsp/x86/convolve_sse4.h" + "${libgav1_source}/dsp/x86/convolve_sse4.inc" 
"${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.cc" "${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.h" + "${libgav1_source}/dsp/x86/film_grain_sse4.cc" + "${libgav1_source}/dsp/x86/film_grain_sse4.h" "${libgav1_source}/dsp/x86/intra_edge_sse4.cc" "${libgav1_source}/dsp/x86/intra_edge_sse4.h" + "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.cc" + "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.h" + "${libgav1_source}/dsp/x86/intrapred_directional_sse4.cc" + "${libgav1_source}/dsp/x86/intrapred_directional_sse4.h" + "${libgav1_source}/dsp/x86/intrapred_filter_sse4.cc" + "${libgav1_source}/dsp/x86/intrapred_filter_sse4.h" "${libgav1_source}/dsp/x86/intrapred_sse4.cc" "${libgav1_source}/dsp/x86/intrapred_sse4.h" - "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.cc" "${libgav1_source}/dsp/x86/intrapred_smooth_sse4.cc" + "${libgav1_source}/dsp/x86/intrapred_smooth_sse4.h" "${libgav1_source}/dsp/x86/inverse_transform_sse4.cc" "${libgav1_source}/dsp/x86/inverse_transform_sse4.h" "${libgav1_source}/dsp/x86/loop_filter_sse4.cc" diff --git a/src/dsp/loop_filter_test.cc b/src/dsp/loop_filter_test.cc new file mode 100644 index 0000000..ca5107a --- /dev/null +++ b/src/dsp/loop_filter_test.cc @@ -0,0 +1,348 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/loop_filter.h" + +#include <algorithm> +#include <cstdint> +#include <cstdio> +#include <cstring> +#include <ostream> +#include <string> + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/third_party/libvpx/md5_helper.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// Horizontal and Vertical need 32x32: 8 pixels preceding filtered section +// 16 pixels within filtered section +// 8 pixels following filtered section +constexpr int kNumPixels = 1024; +constexpr int kBlockStride = 32; + +constexpr int kNumTests = 50000; +constexpr int kNumSpeedTests = 500000; + +constexpr int kMaxLoopFilter = 63; + +template <typename Pixel> +void InitInput(Pixel* dst, const int stride, const int bitdepth, + libvpx_test::ACMRandom& rnd, const uint8_t inner_thresh, + const bool transpose) { + const int max_pixel = (1 << bitdepth) - 1; + const int pixel_range = max_pixel + 1; + Pixel tmp[kNumPixels]; + auto clip_pixel = [max_pixel](int val) { + return static_cast<Pixel>(std::max(std::min(val, max_pixel), 0)); + }; + + for (int i = 0; i < kNumPixels;) { + const uint8_t val = rnd.Rand8(); + if (val & 0x80) { // 50% chance to choose a new value. + tmp[i++] = rnd(pixel_range); + } else { // 50% chance to repeat previous value in row X times. + int j = 0; + while (j++ < ((val & 0x1f) + 1) && i < kNumPixels) { + if (i < 1) { + tmp[i] = rnd(pixel_range); + } else if (val & 0x20) { // Increment by a value within the limit. + tmp[i] = clip_pixel(tmp[i - 1] + (inner_thresh - 1)); + } else { // Decrement by a value within the limit. 
+ tmp[i] = clip_pixel(tmp[i - 1] - (inner_thresh - 1)); + } + ++i; + } + } + } + + for (int i = 0; i < kNumPixels;) { + const uint8_t val = rnd.Rand8(); + if (val & 0x80) { + ++i; + } else { // 50% chance to repeat previous value in column X times. + int j = 0; + while (j++ < ((val & 0x1f) + 1) && i < kNumPixels) { + if (i < 1) { + tmp[i] = rnd(pixel_range); + } else if (val & 0x20) { // Increment by a value within the limit. + tmp[(i % 32) * 32 + i / 32] = clip_pixel( + tmp[((i - 1) % 32) * 32 + (i - 1) / 32] + (inner_thresh - 1)); + } else { // Decrement by a value within the inner_thresh. + tmp[(i % 32) * 32 + i / 32] = clip_pixel( + tmp[((i - 1) % 32) * 32 + (i - 1) / 32] - (inner_thresh - 1)); + } + ++i; + } + } + } + + for (int i = 0; i < kNumPixels; ++i) { + const int offset = transpose ? stride * (i % stride) + i / stride : i; + dst[i] = tmp[offset]; + } +} + +template <int bitdepth, typename Pixel> +class LoopFilterTest : public testing::TestWithParam<LoopFilterSize> { + public: + LoopFilterTest() = default; + LoopFilterTest(const LoopFilterTest&) = delete; + LoopFilterTest& operator=(const LoopFilterTest&) = delete; + ~LoopFilterTest() override = default; + + protected: + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + LoopFilterInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + memcpy(base_loop_filters_, dsp->loop_filters[size_], + sizeof(base_loop_filters_)); + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + memset(base_loop_filters_, 0, sizeof(base_loop_filters_)); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + LoopFilterInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + LoopFilterInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " 
+ << test_case; + } + + memcpy(cur_loop_filters_, dsp->loop_filters[size_], + sizeof(cur_loop_filters_)); + + for (int i = 0; i < kNumLoopFilterTypes; ++i) { + // skip functions that haven't been specialized for this particular + // architecture. + if (cur_loop_filters_[i] == base_loop_filters_[i]) { + cur_loop_filters_[i] = nullptr; + } + } + } + + // Check |digests| if non-NULL otherwise print the filter timing. + void TestRandomValues(const char* const digests[kNumLoopFilterTypes], + int num_runs) const; + void TestSaturatedValues() const; + + const LoopFilterSize size_ = GetParam(); + LoopFilterFunc base_loop_filters_[kNumLoopFilterTypes]; + LoopFilterFunc cur_loop_filters_[kNumLoopFilterTypes]; +}; + +template <int bitdepth, typename Pixel> +void LoopFilterTest<bitdepth, Pixel>::TestRandomValues( + const char* const digests[kNumLoopFilterTypes], const int num_runs) const { + for (int i = 0; i < kNumLoopFilterTypes; ++i) { + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + if (cur_loop_filters_[i] == nullptr) continue; + + libvpx_test::MD5 md5_digest; + absl::Duration elapsed_time; + for (int n = 0; n < num_runs; ++n) { + Pixel dst[kNumPixels]; + const auto outer_thresh = + static_cast<uint8_t>(rnd(3 * kMaxLoopFilter + 5)); + const auto inner_thresh = static_cast<uint8_t>(rnd(kMaxLoopFilter + 1)); + const auto hev_thresh = + static_cast<uint8_t>(rnd(kMaxLoopFilter + 1) >> 4); + InitInput(dst, kBlockStride, bitdepth, rnd, inner_thresh, (n & 1) == 0); + + const absl::Time start = absl::Now(); + cur_loop_filters_[i](dst + 8 + kBlockStride * 8, kBlockStride, + outer_thresh, inner_thresh, hev_thresh); + elapsed_time += absl::Now() - start; + + md5_digest.Add(reinterpret_cast<const uint8_t*>(dst), sizeof(dst)); + } + if (digests == nullptr) { + const auto elapsed_time_us = + static_cast<int>(absl::ToInt64Microseconds(elapsed_time)); + printf("Mode %s[%25s]: %5d us\n", + ToString(static_cast<LoopFilterSize>(size_)), + 
ToString(static_cast<LoopFilterType>(i)), elapsed_time_us); + } else { + const std::string digest = md5_digest.Get(); + printf("Mode %s[%25s]: MD5: %s\n", + ToString(static_cast<LoopFilterSize>(size_)), + ToString(static_cast<LoopFilterType>(i)), digest.c_str()); + EXPECT_STREQ(digests[i], digest.c_str()); + } + } +} + +template <int bitdepth, typename Pixel> +void LoopFilterTest<bitdepth, Pixel>::TestSaturatedValues() const { + const LoopFilterType filter = kLoopFilterTypeHorizontal; + if (cur_loop_filters_[filter] == nullptr) return; + + Pixel dst[kNumPixels], ref[kNumPixels]; + const auto value = static_cast<Pixel>((1 << bitdepth) - 1); + for (auto& r : dst) r = value; + memcpy(ref, dst, sizeof(dst)); + + const int outer_thresh = 24; + const int inner_thresh = 8; + const int hev_thresh = 0; + cur_loop_filters_[filter](dst + 8 + kBlockStride * 8, kBlockStride, + outer_thresh, inner_thresh, hev_thresh); + ASSERT_TRUE(test_utils::CompareBlocks(ref, dst, kBlockStride, kBlockStride, + kBlockStride, kBlockStride, true)) + << "kLoopFilterTypeHorizontal output doesn't match reference"; +} + +//------------------------------------------------------------------------------ + +using LoopFilterTest8bpp = LoopFilterTest<8, uint8_t>; + +const char* const* GetDigests8bpp(LoopFilterSize size) { + static const char* const kDigestsSize4[kNumLoopFilterTypes] = { + "2e07bdb04b363d4ce69c7d738b1ee01a", + "7ff41f2ffa809a2016d342d92afa7f89", + }; + static const char* const kDigestsSize6[kNumLoopFilterTypes] = { + "2cd4d9ee7497ed67e38fad9cbeb7e278", + "75c57a30a927d1aca1ac5c4f175712ca", + }; + static const char* const kDigestsSize8[kNumLoopFilterTypes] = { + "854860a272d58ace223454ea727a6fe4", + "4129ee49b047777583c0e9b2006c87bf", + }; + static const char* const kDigestsSize14[kNumLoopFilterTypes] = { + "6eb768620b7ccc84b6f88b9193b02ad2", + "56e034d9edbe0d5a3cae69b2d9b3486e", + }; + + switch (size) { + case kLoopFilterSize4: + return kDigestsSize4; + case kLoopFilterSize6: + return 
kDigestsSize6; + case kLoopFilterSize8: + return kDigestsSize8; + case kLoopFilterSize14: + return kDigestsSize14; + default: + ADD_FAILURE() << "Unknown loop filter size" << size; + return nullptr; + } +} + +TEST_P(LoopFilterTest8bpp, DISABLED_Speed) { + TestRandomValues(nullptr, kNumSpeedTests); +} + +TEST_P(LoopFilterTest8bpp, FixedInput) { + TestRandomValues(GetDigests8bpp(size_), kNumTests); +} + +TEST_P(LoopFilterTest8bpp, SaturatedValues) { TestSaturatedValues(); } + +constexpr LoopFilterSize kLoopFilterSizes[] = { + kLoopFilterSize4, kLoopFilterSize6, kLoopFilterSize8, kLoopFilterSize14}; + +INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest8bpp, + testing::ValuesIn(kLoopFilterSizes)); + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, LoopFilterTest8bpp, + testing::ValuesIn(kLoopFilterSizes)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, LoopFilterTest8bpp, + testing::ValuesIn(kLoopFilterSizes)); +#endif +//------------------------------------------------------------------------------ + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using LoopFilterTest10bpp = LoopFilterTest<10, uint16_t>; + +const char* const* GetDigests10bpp(LoopFilterSize size) { + static const char* const kDigestsSize4[kNumLoopFilterTypes] = { + "657dd0f612734c9c1fb50a2313567af4", + "b1c0a0a0b35bad1589badf3c291c0461", + }; + static const char* const kDigestsSize6[kNumLoopFilterTypes] = { + "d41906d4830157052d5bde417d9df9fc", + "451490def78bd649d16d64db4e665a62", + }; + static const char* const kDigestsSize8[kNumLoopFilterTypes] = { + "a763127680f31db7184f2a63ee140268", + "1f413bebacaa2435f0e07963a9095243", + }; + static const char* const kDigestsSize14[kNumLoopFilterTypes] = { + "f0e61add3e5856657c4055751a6dd6e2", + "44da25d613ea601bf5f6e2a42d329cf0", + }; + + switch (size) { + case kLoopFilterSize4: + return kDigestsSize4; + case kLoopFilterSize6: + return kDigestsSize6; + case kLoopFilterSize8: + return kDigestsSize8; + case kLoopFilterSize14: + return kDigestsSize14; + 
default: + ADD_FAILURE() << "Unknown loop filter size" << size; + return nullptr; + } +} + +TEST_P(LoopFilterTest10bpp, DISABLED_Speed) { + TestRandomValues(nullptr, kNumSpeedTests); +} + +TEST_P(LoopFilterTest10bpp, FixedInput) { + TestRandomValues(GetDigests10bpp(size_), kNumTests); +} + +INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest10bpp, + testing::ValuesIn(kLoopFilterSizes)); + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, LoopFilterTest10bpp, + testing::ValuesIn(kLoopFilterSizes)); +#endif +#endif + +} // namespace + +static std::ostream& operator<<(std::ostream& os, const LoopFilterSize size) { + return os << ToString(size); +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/loop_restoration.cc b/src/dsp/loop_restoration.cc index 0909df0..1a15d90 100644 --- a/src/dsp/loop_restoration.cc +++ b/src/dsp/loop_restoration.cc @@ -143,12 +143,12 @@ inline void WienerVertical(const int16_t* wiener_buffer, const int width, // filter[3] = 0 - 2 * (filter[0] + filter[1] + filter[2]). // Thus in libaom's computation, an offset of 128 is needed for filter[3]. 
template <int bitdepth, typename Pixel> -void WienerFilter_C(const RestorationUnitInfo& restoration_info, - const void* const source, const void* const top_border, - const void* const bottom_border, const ptrdiff_t stride, - const int width, const int height, - RestorationBuffer* const restoration_buffer, - void* const dest) { +void WienerFilter_C( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { constexpr int kCenterTap = kWienerFilterTaps / 2; const int16_t* const number_leading_zero_coefficients = restoration_info.wiener_info.number_leading_zero_coefficients; @@ -170,38 +170,42 @@ void WienerFilter_C(const RestorationUnitInfo& restoration_info, auto* wiener_buffer = wiener_buffer_org + number_rows_to_skip * width; if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { - WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride, - width, height_extra, filter_horizontal, 0, - &wiener_buffer); + WienerHorizontal<bitdepth, Pixel>( + top + (2 - height_extra) * top_border_stride, top_border_stride, width, + height_extra, filter_horizontal, 0, &wiener_buffer); WienerHorizontal<bitdepth, Pixel>(src, stride, width, height, filter_horizontal, 0, &wiener_buffer); - WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra, - filter_horizontal, 0, &wiener_buffer); - } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { - WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride, - width, height_extra, filter_horizontal, 1, + WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width, + height_extra, filter_horizontal, 0, &wiener_buffer); + } else if 
(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontal<bitdepth, Pixel>( + top + (2 - height_extra) * top_border_stride, top_border_stride, width, + height_extra, filter_horizontal, 1, &wiener_buffer); WienerHorizontal<bitdepth, Pixel>(src, stride, width, height, filter_horizontal, 1, &wiener_buffer); - WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra, - filter_horizontal, 1, &wiener_buffer); - } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { - WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride, - width, height_extra, filter_horizontal, 2, + WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width, + height_extra, filter_horizontal, 1, &wiener_buffer); + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { + WienerHorizontal<bitdepth, Pixel>( + top + (2 - height_extra) * top_border_stride, top_border_stride, width, + height_extra, filter_horizontal, 2, &wiener_buffer); WienerHorizontal<bitdepth, Pixel>(src, stride, width, height, filter_horizontal, 2, &wiener_buffer); - WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra, - filter_horizontal, 2, &wiener_buffer); + WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width, + height_extra, filter_horizontal, 2, + &wiener_buffer); } else { assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); - WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride, - width, height_extra, filter_horizontal, 3, - &wiener_buffer); + WienerHorizontal<bitdepth, Pixel>( + top + (2 - height_extra) * top_border_stride, top_border_stride, width, + height_extra, filter_horizontal, 3, &wiener_buffer); WienerHorizontal<bitdepth, Pixel>(src, stride, width, height, filter_horizontal, 3, &wiener_buffer); - WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra, - filter_horizontal, 3, &wiener_buffer); + 
WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width, + height_extra, filter_horizontal, 3, + &wiener_buffer); } // vertical filtering. @@ -233,7 +237,7 @@ void WienerFilter_C(const RestorationUnitInfo& restoration_info, //------------------------------------------------------------------------------ // SGR -// When |height| is 1, |src_stride| could be set to arbitrary value. +// When |height| is 1, |src_stride| could be set to an arbitrary value. template <typename Pixel, int size> LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride, const int height, const int width, @@ -267,7 +271,7 @@ LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride, } while (--y != 0); } -// When |height| is 1, |src_stride| could be set to arbitrary value. +// When |height| is 1, |src_stride| could be set to an arbitrary value. template <typename Pixel> LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride, const int height, const int width, @@ -541,8 +545,11 @@ inline void BoxFilter(const Pixel* const src, const ptrdiff_t stride, template <int bitdepth, typename Pixel> inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info, - const Pixel* src, const Pixel* const top_border, - const Pixel* bottom_border, const ptrdiff_t stride, + const Pixel* src, const ptrdiff_t stride, + const Pixel* const top_border, + const ptrdiff_t top_border_stride, + const Pixel* bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, SgrBuffer* const sgr_buffer, Pixel* dst) { const auto temp_stride = Align<ptrdiff_t>(width, 8); @@ -582,8 +589,8 @@ inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info, b565[1] = b565[0] + temp_stride; assert(scales[0] != 0); assert(scales[1] != 0); - BoxSum<Pixel>(top_border, stride, 2, width + 2, sum3, sum5 + 1, square_sum3, - square_sum5 + 1); + BoxSum<Pixel>(top_border, top_border_stride, 2, width + 2, sum3, sum5 + 
1, + square_sum3, square_sum5 + 1); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; BoxSum<Pixel>(src, stride, 1, width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2, @@ -631,7 +638,7 @@ inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info, ptrdiff_t s_stride; if ((height & 1) == 0) { sr = bottom_border; - s_stride = stride; + s_stride = bottom_border_stride; } else { sr = src + 2 * stride; s_stride = bottom_border - (src + 2 * stride); @@ -658,8 +665,9 @@ inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info, std::swap(ma565[0], ma565[1]); std::swap(b565[0], b565[1]); } - BoxSum<Pixel>(bottom_border + stride, stride, 1, width + 2, sum3 + 2, - sum5 + 3, square_sum3 + 2, square_sum5 + 3); + BoxSum<Pixel>(bottom_border + bottom_border_stride, bottom_border_stride, 1, + width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2, + square_sum5 + 3); sum5[4] = sum5[3]; square_sum5[4] = square_sum5[3]; BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0], @@ -681,12 +689,13 @@ inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info, template <int bitdepth, typename Pixel> inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, - const Pixel* src, + const Pixel* src, const ptrdiff_t stride, const Pixel* const top_border, + const ptrdiff_t top_border_stride, const Pixel* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - Pixel* dst) { + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, Pixel* dst) { const auto temp_stride = Align<ptrdiff_t>(width, 8); const ptrdiff_t sum_stride = temp_stride + 8; const int sgr_proj_index = restoration_info.sgr_proj_info.index; @@ -705,7 +714,8 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, b565[0] = sgr_buffer->b565; b565[1] = b565[0] + temp_stride; assert(scale != 0); - BoxSum<Pixel, 5>(top_border, stride, 2, 
width + 2, sum5 + 1, square_sum5 + 1); + BoxSum<Pixel, 5>(top_border, top_border_stride, 2, width + 2, sum5 + 1, + square_sum5 + 1); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; BoxSum<Pixel, 5>(src, stride, 1, width + 2, sum5 + 3, square_sum5 + 3); @@ -736,7 +746,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, ptrdiff_t s_stride; if ((height & 1) == 0) { sr = bottom_border; - s_stride = stride; + s_stride = bottom_border_stride; } else { sr = src + 2 * stride; s_stride = bottom_border - (src + 2 * stride); @@ -755,8 +765,8 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, Circulate5PointersBy2<uint16_t>(sum5); Circulate5PointersBy2<uint32_t>(square_sum5); } - BoxSum<Pixel, 5>(bottom_border + stride, stride, 1, width + 2, sum5 + 3, - square_sum5 + 3); + BoxSum<Pixel, 5>(bottom_border + bottom_border_stride, bottom_border_stride, + 1, width + 2, sum5 + 3, square_sum5 + 3); sum5[4] = sum5[3]; square_sum5[4] = square_sum5[3]; BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer, @@ -772,12 +782,13 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, template <int bitdepth, typename Pixel> inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, - const Pixel* src, + const Pixel* src, const ptrdiff_t stride, const Pixel* const top_border, + const ptrdiff_t top_border_stride, const Pixel* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - Pixel* dst) { + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, Pixel* dst) { assert(restoration_info.sgr_proj_info.multiplier[0] == 0); const auto temp_stride = Align<ptrdiff_t>(width, 8); const ptrdiff_t sum_stride = temp_stride + 8; @@ -802,7 +813,8 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, b444[0] = sgr_buffer->b444; b444[1] = 
b444[0] + temp_stride; assert(scale != 0); - BoxSum<Pixel, 3>(top_border, stride, 2, width + 2, sum3, square_sum3); + BoxSum<Pixel, 3>(top_border, top_border_stride, 2, width + 2, sum3, + square_sum3); BoxSum<Pixel, 3>(src, stride, 1, width + 2, sum3 + 2, square_sum3 + 2); BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, false, sgr_buffer, ma343[0], b343[0], nullptr, @@ -814,7 +826,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, s = src + stride; } else { s = bottom_border; - bottom_border += stride; + bottom_border += bottom_border_stride; } BoxSum<Pixel, 3>(s, 0, 1, width + 2, sum3 + 2, square_sum3 + 2); BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true, @@ -845,7 +857,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, b444, dst); src += stride; dst += stride; - bottom_border += stride; + bottom_border += bottom_border_stride; Circulate3PointersBy1<uint16_t>(ma343); Circulate3PointersBy1<uint32_t>(b343); std::swap(ma444[0], ma444[1]); @@ -854,12 +866,12 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, } template <int bitdepth, typename Pixel> -void SelfGuidedFilter_C(const RestorationUnitInfo& restoration_info, - const void* const source, const void* const top_border, - const void* const bottom_border, const ptrdiff_t stride, - const int width, const int height, - RestorationBuffer* const restoration_buffer, - void* const dest) { +void SelfGuidedFilter_C( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { const int index = restoration_info.sgr_proj_info.index; const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 const int radius_pass_1 
= kSgrProjParams[index][2]; // 1 or 0 @@ -872,17 +884,17 @@ void SelfGuidedFilter_C(const RestorationUnitInfo& restoration_info, // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the // following assertion. assert(radius_pass_0 != 0); - BoxFilterProcessPass1<bitdepth, Pixel>(restoration_info, src - 3, top - 3, - bottom - 3, stride, width, height, - sgr_buffer, dst); + BoxFilterProcessPass1<bitdepth, Pixel>( + restoration_info, src - 3, stride, top - 3, top_border_stride, + bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst); } else if (radius_pass_0 == 0) { - BoxFilterProcessPass2<bitdepth, Pixel>(restoration_info, src - 2, top - 2, - bottom - 2, stride, width, height, - sgr_buffer, dst); + BoxFilterProcessPass2<bitdepth, Pixel>( + restoration_info, src - 2, stride, top - 2, top_border_stride, + bottom - 2, bottom_border_stride, width, height, sgr_buffer, dst); } else { - BoxFilterProcess<bitdepth, Pixel>(restoration_info, src - 3, top - 3, - bottom - 3, stride, width, height, - sgr_buffer, dst); + BoxFilterProcess<bitdepth, Pixel>( + restoration_info, src - 3, stride, top - 3, top_border_stride, + bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst); } } diff --git a/src/dsp/loop_restoration_test.cc b/src/dsp/loop_restoration_test.cc new file mode 100644 index 0000000..97a05d4 --- /dev/null +++ b/src/dsp/loop_restoration_test.cc @@ -0,0 +1,616 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/loop_restoration.h" + +#include <algorithm> +#include <cstdint> +#include <cstring> +#include <string> + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/common.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// in unit of Pixel. +constexpr int kBorder = 16; +constexpr int kWidth = 256; +constexpr int kHeight = 255; +constexpr int kStride = kWidth + 2 * kBorder; +constexpr int kOffset = kBorder * kStride + kBorder; +constexpr int kMaxBlockSize = 288 * kStride; +constexpr int kUnitWidths[] = {32, 64, 128, 256}; + +constexpr int kNumRadiusTypes = 3; +constexpr int kNumWienerOrders = 4; +constexpr int kWienerOrders[] = {7, 5, 3, 1}; +constexpr int kWienerOrderIdLookup[] = {0, 3, 0, 2, 0, 1, 0, 0}; + +template <int bitdepth, typename Pixel> +class SelfGuidedFilterTest : public testing::TestWithParam<int>, + public test_utils::MaxAlignedAllocable { + public: + SelfGuidedFilterTest() = default; + SelfGuidedFilterTest(const SelfGuidedFilterTest&) = delete; + SelfGuidedFilterTest& operator=(const SelfGuidedFilterTest&) = delete; + ~SelfGuidedFilterTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + LoopRestorationInit_C(); + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "AVX2/")) { + if ((GetCpuInfo() & kAVX2) != 0) { + LoopRestorationInit_AVX2(); +#if 
LIBGAV1_MAX_BITDEPTH >= 10 + LoopRestorationInit10bpp_AVX2(); +#endif + } + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + LoopRestorationInit_SSE4_1(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + LoopRestorationInit10bpp_SSE4_1(); +#endif + } + } else if (absl::StartsWith(test_case, "NEON/")) { + LoopRestorationInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + target_self_guided_filter_func_ = dsp->loop_restorations[1]; + restoration_info_.type = kLoopRestorationTypeSgrProj; + memset(dst_, 0, sizeof(dst_)); + } + + void SetInputData(int type, Pixel value, int radius_index, + libvpx_test::ACMRandom* rnd); + void TestFixedValues(int test_index, Pixel value); + void TestRandomValues(bool speed); + + protected: + const int unit_width_ = GetParam(); + const int unit_height_ = kRestorationUnitHeight; + + private: + alignas(kMaxAlignment) Pixel src_[kMaxBlockSize]; + alignas(kMaxAlignment) Pixel dst_[kMaxBlockSize]; + RestorationUnitInfo restoration_info_; + RestorationBuffer restoration_buffer_; + LoopRestorationFunc target_self_guided_filter_func_; +}; + +template <int bitdepth, typename Pixel> +void SelfGuidedFilterTest<bitdepth, Pixel>::SetInputData( + int type, Pixel value, int radius_index, + libvpx_test::ACMRandom* const rnd) { + const int mask = (1 << bitdepth) - 1; + if (type == 0) { // Set fixed values + for (auto& s : src_) s = value; + } else { // Set random values + for (auto& s : src_) s = rnd->Rand16() & mask; + } + for (auto& d : dst_) d = rnd->Rand16() & mask; + restoration_info_.sgr_proj_info.multiplier[0] = + kSgrProjMultiplierMin[0] + + rnd->PseudoUniform(kSgrProjMultiplierMax[0] - kSgrProjMultiplierMin[0] + + 1); + restoration_info_.sgr_proj_info.multiplier[1] = + kSgrProjMultiplierMin[1] + + rnd->PseudoUniform(kSgrProjMultiplierMax[1] - kSgrProjMultiplierMin[1] + + 1); + // 
regulate multiplier so that it matches libaom. + // Valid self-guided filter doesn't allow r0 and r1 to be 0 at the same time. + // When r0 or r1 is zero, its corresponding multiplier is set to zero in + // libaom. + int index; + if (radius_index == 0) { + index = 0; // r0 = 2, r1 = 1 + } else if (radius_index == 1) { + index = 10; // r0 = 0, r1 = 1 + } else /* if (radius_index == 2) */ { + index = 14; // r0 = 2, r1 = 0 + } + const uint8_t r0 = kSgrProjParams[index][0]; + const uint8_t r1 = kSgrProjParams[index][2]; + static constexpr int kMultiplier[2] = {0, 95}; + restoration_info_.sgr_proj_info.index = index; + if (r0 == 0) { + restoration_info_.sgr_proj_info.multiplier[0] = kMultiplier[0]; + } else if (r1 == 0) { + restoration_info_.sgr_proj_info.multiplier[1] = kMultiplier[1]; + } +} + +template <int bitdepth, typename Pixel> +void SelfGuidedFilterTest<bitdepth, Pixel>::TestFixedValues(int test_index, + Pixel value) { + static const char* const kDigest[][2][kNumRadiusTypes] = { + {{"7b78783ff4f03625a50c2ebfd574adca", "4faa0810639016f11a9f761ce28c38b0", + "a03314fc210bee68c7adbb44d2bbdac7"}, + {"fce031d1339cfef5016e76a643538a71", "d439e1060de3f07b5b29c9b0b7c08e54", + "a6583fe9359877f4a259c81d900fc4fb"}}, + {{"948ea16a90c4cefef87ce5b0ee105fc6", "76740629877b721432b84dbbdb4e352a", + "27100f37b3e42a5f2a051e1566edb6f8"}, + {"dd320de3bc82f4ba69738b2190ea9f85", "bf82f271e30a1aca91e53b086e133fb3", + "69c274ac59c99999e1bfbf2fc4586ebd"}}, + {{"9fbf1b246011250f38532a543cc6dd74", "d5c1e0142390ebb51b075c49f8ee9ff4", + "92f31086ba2f9e1508983b22d93a4e5c"}, + {"2198321e6b95e7199738e60f5ddc6966", "34f74626027ffca010c824ddf0942b13", + "43dd7df2c2a601262c68cd8af1c61b82"}}, + {{"42364ff8dbdbd6706fa3b8855a4258be", "a7843fdfd4d3c0d80ba812b353b4d6b4", + "f8a6a025827f29f857bed3e28ba3ea33"}, + {"b83c1f8d7712e37f9b21b033822e37ed", "589daf2e3e6f8715873920515cfc1b42", + "20dcbe8e317a4373bebf11d56adc5f02"}}}; + if (target_self_guided_filter_func_ == nullptr) return; + ASSERT_LT(value, 1 
<< bitdepth); + constexpr int bd_index = (bitdepth == 8) ? 0 : 1; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + const Pixel* const src = src_ + kOffset; + Pixel* const dst = dst_ + kOffset; + for (int radius_index = 0; radius_index < kNumRadiusTypes; ++radius_index) { + SetInputData(0, value, radius_index, &rnd); + const absl::Time start = absl::Now(); + for (int y = 0; y < kHeight; y += unit_height_) { + const int height = std::min(unit_height_, kHeight - y); + for (int x = 0; x < kWidth; x += unit_width_) { + const int width = std::min(unit_width_, kWidth - x); + const Pixel* const source = src + y * kStride + x; + target_self_guided_filter_func_( + restoration_info_, source, kStride, + source - kRestorationVerticalBorder * kStride, kStride, + source + height * kStride, kStride, width, height, + &restoration_buffer_, dst + y * kStride + x); + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "kLoopRestorationTypeSgrProj", std::to_string(GetParam()).c_str(), + kDigest[test_index][bd_index][radius_index], dst_ + kBorder * kStride, + kHeight * kStride * sizeof(*dst_), elapsed_time); + } +} + +template <int bitdepth, typename Pixel> +void SelfGuidedFilterTest<bitdepth, Pixel>::TestRandomValues(bool speed) { + static const char* const kDigest[][2][kNumRadiusTypes] = { + {{"9f8358ed820943fa0abe3a8ebb5887db", "fb5d48870165522341843bcbfa8674fb", + "ca67159cd29475ac5d52ca4a0df3ea10"}, + {"a78641886ea0cf8757057d1d91e01434", "1b95172a5f2f9c514c78afa4cf8e5678", + "a8ba988283d9e1ad1f0dcdbf6bbdaade"}}, + {{"f219b445e5c80ffb5dd0359cc2cb4dd4", "699b2c9ddca1cbb0d4fc24cbcbe951e9", + "a4005899fa8d3c3c4669910f93ff1290"}, + {"10a75cab3c78b891c8c6d92d55f685d1", "d46f158f57c628136f6f298ee8ca6e0e", + "07203ad761775d5d317f2b7884afd9fe"}}, + {{"000d4e382be4003b514c9135893d0a37", "8fb082dca975be363bfc9c2d317ae084", + "475bcb6a58f87da7723f6227bc2aca0e"}, + {"4d589683f69ccc5b416149dcc5c835d5", 
"986b6832df1f6020d50be61ae121e42f", + "7cb5c5dbdb3d1c54cfa00def450842dc"}}, + {{"fd43bfe34d63614554dd29fb24b12173", "5c1ba74ba3062c769d5c3c86a85ac9b9", + "f1eda6d15b37172199d9949c2315832f"}, + {"a11be3117fb77e8fe113581b06f98bd1", "df94d12b774ad5cf744c871e707c36c8", + "b23dc0b54c3500248d53377030428a61"}}, + {{"f3079b3b21d8dc6fce7bb1fd104be359", "c6fcbc686cfb97ab3a64f445d73aad36", + "23966cba3e0e7803eeb951905861e0dd"}, + {"7210391a6fe26e5ca5ea205bc38aa035", "4c3e6eccad3ea152d320ecd1077169de", + "dcee48f94126a2132963e86e93dd4903"}}}; + if (target_self_guided_filter_func_ == nullptr) return; + constexpr int bd_index = (bitdepth == 8) ? 0 : 1; + const int num_inputs = speed ? 1 : 5; + const int num_tests = speed ? 20000 : 1; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + const Pixel* const src = src_ + kOffset; + Pixel* const dst = dst_ + kOffset; + for (int i = 0; i < num_inputs; ++i) { + for (int radius_index = 0; radius_index < kNumRadiusTypes; ++radius_index) { + SetInputData(1, 0, radius_index, &rnd); + const absl::Time start = absl::Now(); + for (int k = 0; k < num_tests; ++k) { + for (int y = 0; y < kHeight; y += unit_height_) { + const int height = std::min(unit_height_, kHeight - y); + for (int x = 0; x < kWidth; x += unit_width_) { + const int width = std::min(unit_width_, kWidth - x); + const Pixel* const source = src + y * kStride + x; + target_self_guided_filter_func_( + restoration_info_, source, kStride, + source - kRestorationVerticalBorder * kStride, kStride, + source + height * kStride, kStride, width, height, + &restoration_buffer_, dst + y * kStride + x); + } + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "kLoopRestorationTypeSgrProj", std::to_string(GetParam()).c_str(), + kDigest[i][bd_index][radius_index], dst_ + kBorder * kStride, + kHeight * kStride * sizeof(*dst_), elapsed_time); + } + } +} + +using SelfGuidedFilterTest8bpp = SelfGuidedFilterTest<8, uint8_t>; + 
+TEST_P(SelfGuidedFilterTest8bpp, Correctness) { + TestFixedValues(0, 0); + TestFixedValues(1, 1); + TestFixedValues(2, 128); + TestFixedValues(3, 255); + TestRandomValues(false); +} + +TEST_P(SelfGuidedFilterTest8bpp, DISABLED_Speed) { TestRandomValues(true); } + +INSTANTIATE_TEST_SUITE_P(C, SelfGuidedFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#if LIBGAV1_ENABLE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, SelfGuidedFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, SelfGuidedFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, SelfGuidedFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using SelfGuidedFilterTest10bpp = SelfGuidedFilterTest<10, uint16_t>; + +TEST_P(SelfGuidedFilterTest10bpp, Correctness) { + TestFixedValues(0, 0); + TestFixedValues(1, 1); + TestFixedValues(2, 512); + TestFixedValues(3, 1023); + TestRandomValues(false); +} + +TEST_P(SelfGuidedFilterTest10bpp, DISABLED_Speed) { TestRandomValues(true); } + +INSTANTIATE_TEST_SUITE_P(C, SelfGuidedFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); + +#if LIBGAV1_ENABLE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, SelfGuidedFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, SelfGuidedFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); +#endif + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +template <int bitdepth, typename Pixel> +class WienerFilterTest : public testing::TestWithParam<int>, + public test_utils::MaxAlignedAllocable { + public: + WienerFilterTest() = default; + WienerFilterTest(const WienerFilterTest&) = delete; + WienerFilterTest& operator=(const WienerFilterTest&) = delete; + ~WienerFilterTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + LoopRestorationInit_C(); + const Dsp* const dsp = GetDspTable(bitdepth); + 
ASSERT_NE(dsp, nullptr); + base_wiener_filter_func_ = dsp->loop_restorations[0]; + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "AVX2/")) { + if ((GetCpuInfo() & kAVX2) != 0) { + LoopRestorationInit_AVX2(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + LoopRestorationInit10bpp_AVX2(); +#endif + } + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + LoopRestorationInit_SSE4_1(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + LoopRestorationInit10bpp_SSE4_1(); +#endif + } + } else if (absl::StartsWith(test_case, "NEON/")) { + LoopRestorationInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + target_wiener_filter_func_ = dsp->loop_restorations[0]; + restoration_info_.type = kLoopRestorationTypeWiener; + memset(dst_, 0, sizeof(dst_)); + memset(tmp_, 0, sizeof(tmp_)); + memset(buffer_, 0, sizeof(buffer_)); + } + + static void CleanFilterByOrder(const int order, + int16_t filter[kWienerFilterTaps]) { + if (order <= 5) filter[0] = 0; + if (order <= 3) filter[1] = 0; + if (order <= 1) filter[2] = 0; + } + + void SetInputData(int type, Pixel value, int vertical_order, + int horizontal_order); + void TestFixedValues(int digest_id, Pixel value); + void TestRandomValues(bool speed); + void TestCompare2C(); + + protected: + const int unit_width_ = GetParam(); + const int unit_height_ = kRestorationUnitHeight; + + private: + alignas(kMaxAlignment) + uint16_t buffer_[(kRestorationUnitWidth + kWienerFilterTaps - 1) * + kRestorationUnitHeight]; + alignas(kMaxAlignment) Pixel src_[kMaxBlockSize]; + alignas(kMaxAlignment) Pixel dst_[kMaxBlockSize]; + alignas(kMaxAlignment) Pixel tmp_[kMaxBlockSize]; + RestorationUnitInfo restoration_info_; + RestorationBuffer restoration_buffer_; + LoopRestorationFunc 
base_wiener_filter_func_; + LoopRestorationFunc target_wiener_filter_func_; +}; + +template <int bitdepth, typename Pixel> +void WienerFilterTest<bitdepth, Pixel>::SetInputData( + int type, Pixel value, const int vertical_order, + const int horizontal_order) { + const int mask = (1 << bitdepth) - 1; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + if (type == 0) { + for (auto& s : src_) s = value; + } else { + for (auto& s : src_) s = rnd.Rand16() & mask; + } + int order = vertical_order; + for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) { + auto& filter = restoration_info_.wiener_info.filter[i]; + filter[3] = 128; + for (int j = 0; j < 3; ++j) { + filter[j] = kWienerTapsMin[j] + + rnd.PseudoUniform(kWienerTapsMax[j] - kWienerTapsMin[j] + 1); + } + CleanFilterByOrder(order, filter); + filter[3] -= 2 * (filter[0] + filter[1] + filter[2]); + restoration_info_.wiener_info.number_leading_zero_coefficients[i] = + (kWienerFilterTaps - order) / 2; + order = horizontal_order; + } +} + +template <int bitdepth, typename Pixel> +void WienerFilterTest<bitdepth, Pixel>::TestFixedValues(int digest_id, + Pixel value) { + static const char* const kDigest[2][4] = { + {"74fc90760a14b13340cb718f200ba350", "5bacaca0128cd36f4805330b3787771d", + "1109e17545cc4fbd5810b8b77e19fc36", "e7f914ec9d065aba92338016e17a526c"}, + {"c8cc38790ceb0bea1eb989686755e1e5", "70f573b7e8875262c638a68d2f317916", + "193b19065899c835cb513149eb36d135", "f1dff65e3e53558b303ef0a2e3f3ba98"}}; + if (target_wiener_filter_func_ == nullptr) return; + ASSERT_LT(value, 1 << bitdepth); + constexpr int bd_index = (bitdepth == 8) ? 
0 : 1; + const Pixel* const src = src_ + kOffset; + Pixel* const dst = dst_ + kOffset; + for (const auto vertical_order : kWienerOrders) { + for (const auto horizontal_order : kWienerOrders) { + SetInputData(0, value, vertical_order, horizontal_order); + memset(dst_, 0, sizeof(dst_)); + const absl::Time start = absl::Now(); + for (int y = 0; y < kHeight; y += unit_height_) { + const int height = std::min(unit_height_, kHeight - y); + for (int x = 0; x < kWidth; x += unit_width_) { + const int width = std::min(unit_width_, kWidth - x); + const Pixel* const source = src + y * kStride + x; + target_wiener_filter_func_( + restoration_info_, source, kStride, + source - kRestorationVerticalBorder * kStride, kStride, + source + height * kStride, kStride, width, height, + &restoration_buffer_, dst + y * kStride + x); + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "kLoopRestorationTypeWiener", std::to_string(GetParam()).c_str(), + kDigest[bd_index][digest_id], dst_, sizeof(dst_), elapsed_time); + } + } +} + +template <int bitdepth, typename Pixel> +void WienerFilterTest<bitdepth, Pixel>::TestRandomValues(bool speed) { + static const char* const kDigest[2][kNumWienerOrders][kNumWienerOrders] = { + {{"40d0cf56d2ffb4f581e68b0fc97f547f", "5c04745209b684ba98004ebb0f64e70b", + "545ed7d3f7e7ca3b86b4ada31f7aaee7", "0d6b2967f1bd1d99b720e563fe0cf03f"}, + {"44b37076f0cf27f6eb506aca50c1d3e4", "e927d64dc9249e05a65e10ee75baa7d9", + "6136ecb4e29b17c9566504148943fd47", "c5ee2da81d44dc8cb2ac8021f724eb7a"}, + {"125cbb227313ec91a2683f26e6f049d1", "77671b6529c806d23b749f304b548f59", + "28d53a1b486881895b8f73fa64486df1", "f5e32165bafe575d7ee7a6fbae75f36d"}, + {"e832c41f2566ab542b32abba9d4f27bd", "ab1336ee6b85cba651f35ee5d3b3cc5c", + "52a673b6d14fbdca5ebdb1a34ee3326f", + "ebb42c7c9111f2e39f21e2158e801d9e"}}, + {{"8cd9c6bd9983bd49564a58ed4af9098a", "f71f333c9d71237ed4e46f0ef2283196", + "375b43abc1d6682d62f91c1841b8b0fc", 
"71e2444822ae9c697ddfc96e07c6e8a1"}, + {"d9ed3a66ceef405c08c87f6e91b71059", "c171fcff5fb7bb919f13ead7a4917a4c", + "8fbd1edb82fcd78d4d286886f65a700a", "fe14a143e6b261c5bb07b179d40be5a2"}, + {"1c995f4e7f117857de73211b81093bd0", "5ab1ee3bb14adcd66d66802d58bee068", + "d77430783e173ebd1b30e5d9336c8b69", "e159a3620747458dff7ed3d20da1a4b7"}, + {"5346fa07d195c257548a332753b057a3", "c77674bc0a638abc4d38d58e494fc7cf", + "7cbc1562a9dd08e1973b3b9ac1afc765", + "3c91bf1a34672cd40bf261c5820d3ec3"}}}; + if (target_wiener_filter_func_ == nullptr) return; + constexpr int bd_index = (bitdepth == 8) ? 0 : 1; + const int num_tests = speed ? 100000 : 1; + const Pixel* const src = src_ + kOffset; + Pixel* const dst = dst_ + kOffset; + for (const auto vertical_order : kWienerOrders) { + for (const auto horizontal_order : kWienerOrders) { + SetInputData(1, (1 << bitdepth) - 1, vertical_order, horizontal_order); + memset(dst_, 0, sizeof(dst_)); + const absl::Time start = absl::Now(); + for (int i = 0; i < num_tests; ++i) { + for (int y = 0; y < kHeight; y += unit_height_) { + const int height = std::min(unit_height_, kHeight - y); + for (int x = 0; x < kWidth; x += unit_width_) { + const int width = std::min(unit_width_, kWidth - x); + const Pixel* const source = src + y * kStride + x; + target_wiener_filter_func_( + restoration_info_, source, kStride, + source - kRestorationVerticalBorder * kStride, kStride, + source + height * kStride, kStride, width, height, + &restoration_buffer_, dst + y * kStride + x); + } + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "kLoopRestorationTypeWiener", std::to_string(GetParam()).c_str(), + kDigest[bd_index][kWienerOrderIdLookup[vertical_order]] + [kWienerOrderIdLookup[horizontal_order]], + dst_, sizeof(dst_), elapsed_time); + } + } +} + +template <int bitdepth, typename Pixel> +void WienerFilterTest<bitdepth, Pixel>::TestCompare2C() { + if (base_wiener_filter_func_ == nullptr) return; + if 
(target_wiener_filter_func_ == nullptr) return; + if (base_wiener_filter_func_ == target_wiener_filter_func_) return; + const Pixel* const src = src_ + kOffset; + Pixel* const dst = dst_ + kOffset; + Pixel* const tmp = tmp_ + kOffset; + for (const auto vertical_order : kWienerOrders) { + for (const auto horizontal_order : kWienerOrders) { + SetInputData(1, (1 << bitdepth) - 1, vertical_order, horizontal_order); + for (int x = 0; x < 2; ++x) { + // Prepare min/max filter coefficients. + int order = vertical_order; + for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) { + auto& filter = restoration_info_.wiener_info.filter[i]; + for (int j = 0; j < 3; ++j) { + filter[j] = (x == 0) ? kWienerTapsMin[j] : kWienerTapsMax[j]; + } + CleanFilterByOrder(order, filter); + filter[3] = 128 - 2 * (filter[0] + filter[1] + filter[2]); + restoration_info_.wiener_info.number_leading_zero_coefficients[i] = + (kWienerFilterTaps - order) / 2; + order = horizontal_order; + } + base_wiener_filter_func_(restoration_info_, src, kStride, + src - kRestorationVerticalBorder * kStride, + kStride, src + unit_height_ * kStride, kStride, + unit_width_, unit_height_, + &restoration_buffer_, dst); + target_wiener_filter_func_(restoration_info_, src, kStride, + src - kRestorationVerticalBorder * kStride, + kStride, src + unit_height_ * kStride, + kStride, unit_width_, unit_height_, + &restoration_buffer_, tmp); + if (!test_utils::CompareBlocks(dst, tmp, unit_width_, unit_height_, + kStride, kStride, false, false)) { + ADD_FAILURE() << "Mismatch -- wiener taps min/max"; + } + } + } + } +} + +using WienerFilterTest8bpp = WienerFilterTest<8, uint8_t>; + +TEST_P(WienerFilterTest8bpp, Correctness) { + TestFixedValues(0, 0); + TestFixedValues(1, 1); + TestFixedValues(2, 128); + TestFixedValues(3, 255); + TestRandomValues(false); +} + +TEST_P(WienerFilterTest8bpp, DISABLED_Speed) { TestRandomValues(true); } + +TEST_P(WienerFilterTest8bpp, TestCompare2C) { TestCompare2C(); } + 
+INSTANTIATE_TEST_SUITE_P(C, WienerFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#if LIBGAV1_ENABLE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, WienerFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, WienerFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, WienerFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using WienerFilterTest10bpp = WienerFilterTest<10, uint16_t>; + +TEST_P(WienerFilterTest10bpp, Correctness) { + TestFixedValues(0, 0); + TestFixedValues(1, 1); + TestFixedValues(2, 512); + TestFixedValues(3, 1023); + TestRandomValues(false); +} + +TEST_P(WienerFilterTest10bpp, DISABLED_Speed) { TestRandomValues(true); } + +TEST_P(WienerFilterTest10bpp, TestCompare2C) { TestCompare2C(); } + +INSTANTIATE_TEST_SUITE_P(C, WienerFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); + +#if LIBGAV1_ENABLE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, WienerFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, WienerFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); +#endif + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/mask_blend.cc b/src/dsp/mask_blend.cc index 101c410..15ef821 100644 --- a/src/dsp/mask_blend.cc +++ b/src/dsp/mask_blend.cc @@ -25,8 +25,8 @@ namespace libgav1 { namespace dsp { namespace { -template <int subsampling_x, int subsampling_y> -uint8_t GetMaskValue(const uint8_t* mask, const uint8_t* mask_next_row, int x) { +uint8_t GetMaskValue(const uint8_t* mask, const uint8_t* mask_next_row, int x, + int subsampling_x, int subsampling_y) { if ((subsampling_x | subsampling_y) == 0) { return mask[x]; } @@ -63,7 +63,7 @@ void MaskBlend_C(const void* prediction_0, const void* prediction_1, for (int y = 0; y < height; ++y) { for (int x = 0; x < width; 
++x) { const uint8_t mask_value = - GetMaskValue<subsampling_x, subsampling_y>(mask, mask_next_row, x); + GetMaskValue(mask, mask_next_row, x, subsampling_x, subsampling_y); if (is_inter_intra) { dst[x] = static_cast<Pixel>(RightShiftWithRounding( mask_value * pred_1[x] + (64 - mask_value) * pred_0[x], 6)); @@ -96,7 +96,7 @@ void InterIntraMaskBlend8bpp_C(const uint8_t* prediction_0, for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { const uint8_t mask_value = - GetMaskValue<subsampling_x, subsampling_y>(mask, mask_next_row, x); + GetMaskValue(mask, mask_next_row, x, subsampling_x, subsampling_y); prediction_1[x] = static_cast<uint8_t>(RightShiftWithRounding( mask_value * prediction_1[x] + (64 - mask_value) * prediction_0[x], 6)); @@ -148,6 +148,7 @@ void Init8bpp() { #ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_C<1, 1>; #endif + static_cast<void>(GetMaskValue); #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } diff --git a/src/dsp/mask_blend_test.cc b/src/dsp/mask_blend_test.cc new file mode 100644 index 0000000..b5e7e60 --- /dev/null +++ b/src/dsp/mask_blend_test.cc @@ -0,0 +1,493 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/mask_blend.h" + +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <ostream> +#include <string> +#include <type_traits> + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kNumSpeedTests = 50000; +// mask_blend is applied to compound prediction values when is_inter_intra is +// false. This implies a range far exceeding that of pixel values. The ranges +// include kCompoundOffset in 10bpp and 12bpp. +// see: src/dsp/convolve.cc & src/dsp/warp.cc. +constexpr int kCompoundPredictionRange[3][2] = { + // 8bpp + {-5132, 9212}, + // 10bpp + {3988, 61532}, + // 12bpp + {3974, 61559}, +}; + +const char* GetDigest8bpp(int id) { + static const char* const kDigest[] = { + "4b70d5ef5ac7554b4b2660a4abe14a41", "64adb36f07e4a2c4ea4f05cfd715ff58", + "c490478208374a43765900ef7115c264", "b98f222eb70ef8589da2d6c839ca22b8", + "54752ca05f67b5af571bc311aa4e3de3", "344b2dab7accd8bd0a255bee16207336", + "0b2f6f755d1547eea7e0172f8133ea01", "310dc6364fdacba186c01f0e8ac4fcb7", + "b0c9f08b73d9e5c16eaf5abdbca1fdc0", "eaad805999d949fa1e1bbbb63b4b7827", + "6eb2a80d212df89403efb50db7a81b08", "c30730aa799dba78a2ebd3f729af82c7", + "4346c2860b23f0072b6b288f14c1df36", "8f8dd3eeed74ef115ca8a2f82ebff0ba", + "42e8872a81647767636f4c75609e0e2f", "1ff2526547d59557f7bb458249e34527", + "cd303d685268aebd2919dd468928d0ba", "254fb3ad990f9d408d252c70dd682e27", + "ba8d99c62853d14855f5d93e9574c97b", "e8ab744348681d6aa1043080efa86fc9", + "2fa919ca1f54b4336de878ff4015c352", "18e47c9809b909c2bfad08e00dffc635", + 
"9a90c843f06f0b662c509c26f5dd5054", "f89c608f884f37b064fc2b49eb2690a9", + "2448734d948ca6ddeb0ce8038a4ab2cf", "a3e0f86b7a5cb49716a424709c00b5a4", + "eb84dba768b54da10cded2f932f0aab7", "d6e8fdeb6875b70488f25d7f7ed9423f", + "1ca0822febce19c02ddc42a7b3331257", "a9259bb9b87ad002619eb47b907d7226", + "6408c5f327f1a9a390fb0046d4bc112b", "dba612489f87d00a82f2735fbcb98dcc", + "e8626a97699fbd247d6358ad5f766bee", "5e638a6897d7a2950f3512f871fa19e6", + "45a58708939779413f8e0e1de2ee5e6f", "079ae4682d398f0a7e4b66059589586d", + "6a06e617308409f9181b59bdd4f63d83", "b05ade2c1a572fc5fcca92b4163d9afb", + "30e955c3f86111207d5922575602e90a", "af5e6c65ed48a0eb7d509f7036398728", + "f9da3310d7dc75910483dfdd2af6ee62", "a9423b4d67bee5e7c7bc3baa7a9c017a", + "6b90a04333407013dd011c1af582e79f", "e658088a74bfb7cc57a2faa74a6f8689", + "6eedf27126eba6915035f9f701a1b992", "89116a7c6ad3f70a5b3f3105d04ad1a8", + "f41e5e166b049d0006d8b2cab56523b3", "3bed57a684075bbe3c25fd0c3e5520c3", + "85c0b21af2afb18ce948abfe3e23c85b", "bd8aaa3602d6b42438f8449f8adb52cb", + "1266bad904caad2c6d4047abefc2393d", "6573f2fe2a14c9ab7d5e192742388489", + "6b9b443f6306059fa3fe18df9de6dc48", "c9a91ee6ae8b653f552866e4073dd097", + "fa58938384198f7709d4871d155ba100", "033d121fc782e83ff94c31e73407d2a8", + "7ea268d79f7b8c75a4feeb24e892471a", "73a376bb3e07172d1e094ab8e01a7d42", + "13c366e0da1663fac126ea3d3876c110", "2f5eb5fcdf953c63fee2b8c75a6e5568", + "2054b197f002223f2d75699884279511", "67ce53e6991657a922d77cc8a23f1e07", + "f48e6d666435e7a917d6f90539b0d557", "21d03669d8d255e43552f8fb90724717", + "43dbaa1a7aaf2a01764e78e041b6763b", "a8173347ea861ecee6da54f81df73951", + "6b97ec4e4647a8de026d693059b855b7", "a85bf4c4b48791ac4971339877e4bc8a", + "04cf84d020a60ce3ce53845255ca8ec9", "ddd87035b960499b883d0aefcf96b6b2", + "278c5dd102474d598bf788cd66977ba9", "78b3790785811516142d417a49177c8c", + "7883ea9c2df0b4f5797cba31f4352678", "727004811025ac97b04940e2eaf68f94", + "7ffa3f97ec13dc8b6225550133a392bc", "6f5f2cb7a44aa0daea5c6b3315110591", + 
"88a59d68875fb44ec3be9d3fa293bccb", "0516e71f76b9d998794d3d63e480fa2f", + "193793d42f0964b4a958a68d9d7eb4ba", "4d259c7c6a95744e4ebaaa5361befb11", + "c090155b997dc103203bcb5a9dcc6282", + }; + return kDigest[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetDigest10bpp(int id) { + static const char* const kDigest[] = { + "1af3cbd1616941b59e6a3f6a417b6312", "1d8b3f4b9d5d2f4ff5be8e81b7243121", + "53a3a76bf2bcd5761cd15fc739a4f4e1", "7597f69dc19a584280be0d67911db6a6", + "e1221c172843dc6c1b345bcd370771cc", "2ccbe012ca167114b14c3ba70befa960", + "0f68632d7e5faddb4554ca430d1df822", "8caa0061a26e142b783951d5abd7bf5d", + "1cce6acdbd8ca8d2546ba937584730bf", "022913e87a3c1a86aaefe2c2d4f89882", + "48f8ab636ba15a06731d869b603cbe58", "ba1616c990d224c20de123c3ccf19952", + "346a797b7cb4de10759e329f8b49e077", "8f4aa102e9b1ac430bdb9ebd4ec4cfca", + "5886397456b15e504ad55d8e0ce71e0e", "2a78b52ce43dc28606e83521963c00fa", + "8d3ef5280063337b0df97f91251bb8fc", "81f0ceada000ce40586be828a2045430", + "edb7b70a473392148bc419a44385326b", "97abe2eecaf9158a0529b234a241a57a", + "65729d750aa1258e4a7eccef247ac8c2", "78cc995e81188b9e8b29fa58796a3313", + "a1eb6a8c2f7c77e30e739a1b3b07cc74", "805b0f2f4b9d80f118d800b5ab4f603e", + "12610c83533f7170149390ba581f70b2", "cba20deed43b49ada3f626c91510995d", + "ba7ea35410b746fcbcf56c24ccb56d59", "933b2235b9b943984607d87f0bce1067", + "7ae59015295db8983bc8472429076464", "c18cce63327b367c0a260e9cbf4222b9", + "7c9672a7dfa964cb3ed3f2b4b443d2b6", "b29bcf1cc5369702e0179db1198db531", + "412326aff6c89116240b5d3ef63fa5cc", "3d854589fd171e42d118be4627ec5330", + "9a157e51e39ed314031224f074193791", "c645cdc63d3112f27b90cc9080c6d071", + "3f360cc336a4ee9a9bd78bde1a6e9eb3", "37b40fa8674d03a7cd66afdee939b9bf", + "cd6c7b98fe71b533c6a06d6d9122a6d0", "c26e0a0e90a969d762edcab770bed3b7", + "e517967d2cf4f1b0fff09d334475e2ae", "bc760a328a0a4b2d75593667adfa2a0e", + "b6239fdeeccc462640047cb2e2c2be96", "bc01f6a232ef9f0d9e57301779edd67f", + "cf6e8c1823c5498fa5589db40406a6ad", 
"2a9a4bd0bd84f0b85225a5b30f5eaa16", + "56f7bb2265dbd8a563bb269aa527c8a3", "fcbed0f0350be5a1384f95f8090d262e", + "f3ecf2e5747ebff65ac78ecbe7cc5e6a", "1d57d1371ad2f5f320cc4de789665f7c", + "e9f400fee64673b0f6313400fe449135", "5dfdc4a8376740011c777df46418b5d2", + "a4eb2c077300c0d8eeda028c9db3a63a", "90551259280c2b2150f018304204f072", + "4cbcd76496fc5b841cd164b6067b9c0b", "895964acc7b7e7d084de2266421c351b", + "af2e05159d369d0e3b72707f242b2845", "c7d393cef751950df3b9ed8056a9ffce", + "788541c0807aed47b863d47e5912555d", "163a06512f48c1b0f2535c8c50815bcc", + "dc5e723bab9fbfd7074a62e05b6b3c2b", "bf91200ce1bf97b4642a601adc13d700", + "d93fcefa6b9004baaab76d436e7ac931", "e89a2111caecc6bcf5f2b42ea0167ab4", + "e04a058df9b87878ca97edc1c42e76e1", "5d1f60876147edd6ed29d1fb50172464", + "655fb228aa410fd244c58c87fe510bec", "639a8a0a8f62d628136f5a97b3728b69", + "5b60f2428b092a502d6471fa09befd7f", "40601555ac945b4d37d3434b6e5619be", + "02be23bf1f89d5f5af02a39b98f96142", "9347a45bd54d28d8105f8183996b3505", + "d8429cc7b0b388981861a0fdd40289f0", "c4b7fab3b044486f663e160c07805e0a", + "f5f5d513b1f1c13d0abc70fc18afea48", "f236795ea30f1b8761b268734a245ba1", + "c7b7452ea8247a3a40248278d08953d5", "ddd6ba3c5ec56cc7a0b0161ae67001fa", + "94675749f2db46a8ade6f2f211db9a32", "3d165364ff96a5ef39e67a53fe3ed3be", + "3d1d66a9401fd7e78050724ca1fa0419", + }; + return kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +struct MaskBlendTestParam { + MaskBlendTestParam(int width, int height, int subsampling_x, + int subsampling_y, bool is_inter_intra, + bool is_wedge_inter_intra) + : width(width), + height(height), + subsampling_x(subsampling_x), + subsampling_y(subsampling_y), + is_inter_intra(is_inter_intra), + is_wedge_inter_intra(is_wedge_inter_intra) {} + int width; + int height; + int subsampling_x; + int subsampling_y; + bool is_inter_intra; + bool is_wedge_inter_intra; +}; + +std::ostream& operator<<(std::ostream& os, const MaskBlendTestParam& param) { + return os << "BlockSize" << param.width << 
"x" << param.height + << ", subsampling(x/y): " << param.subsampling_x << "/" + << param.subsampling_y + << ", is_inter_intra: " << param.is_inter_intra + << ", is_wedge_inter_intra: " << param.is_wedge_inter_intra; +} + +template <int bitdepth, typename Pixel> +class MaskBlendTest : public testing::TestWithParam<MaskBlendTestParam>, + public test_utils::MaxAlignedAllocable { + public: + MaskBlendTest() = default; + ~MaskBlendTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + MaskBlendInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + MaskBlendInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + MaskBlendInit_SSE4_1(); + } + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + func_ = (param_.is_inter_intra && !param_.is_wedge_inter_intra) + ? dsp->mask_blend[0][param_.is_inter_intra] + : dsp->mask_blend[param_.subsampling_x + param_.subsampling_y] + [param_.is_inter_intra]; + func_8bpp_ = dsp->inter_intra_mask_blend_8bpp[param_.is_wedge_inter_intra + ? param_.subsampling_x + + param_.subsampling_y + : 0]; + } + + protected: + int GetDigestIdOffset() const { + // id is for retrieving the corresponding digest from the lookup table given + // the set of input parameters. id can be figured out by its width, height + // and an offset (id_offset). + // For example, in kMaskBlendTestParam, this set of parameters + // (8, 8, 0, 0, false, false) corresponds to the first entry in the + // digest lookup table, where id == 0. + // (8, 8, 1, 0, false, false) corresponds to id == 13. 
+ // (8, 8, 1, 1, false, false) corresponds to id == 26. + // (8, 8, 0, 0, true, false) corresponds to id == 39. + // Id_offset denotes offset for different modes (is_inter_intra, + // is_wedge_inter_intra). Width and height help to figure out id: + // width = 8, height = 8, id = id_offset + log2(8) - 3. + // width = 8, height = 16, id = id_offset + log2(min(width, height) - 3 + 1. + // ... + if (!param_.is_inter_intra && !param_.is_wedge_inter_intra) { + return param_.subsampling_x * 13 + param_.subsampling_y * 13; + } + if (param_.is_inter_intra && !param_.is_wedge_inter_intra) { + return 39 + param_.subsampling_x * 7 + param_.subsampling_y * 7; + } + if (param_.is_inter_intra && param_.is_wedge_inter_intra) { + return 60 + param_.subsampling_x * 7 + param_.subsampling_y * 7; + } + return 0; + } + + int GetDigestId() const { + int id = GetDigestIdOffset(); + if (param_.width == param_.height) { + return id + 3 * (FloorLog2(param_.width) - 3); + } + if (param_.width < param_.height) { + return id + 1 + 3 * (FloorLog2(param_.width) - 3); + } + return id + 2 + 3 * (FloorLog2(param_.height) - 3); + } + + void Test(const char* digest, int num_runs); + + private: + using PredType = + typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type; + static constexpr int kStride = kMaxSuperBlockSizeInPixels; + static constexpr int kDestStride = kMaxSuperBlockSizeInPixels * sizeof(Pixel); + const MaskBlendTestParam param_ = GetParam(); + alignas(kMaxAlignment) PredType + source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {}; + uint8_t source1_8bpp_[kMaxSuperBlockSizeInPixels * + kMaxSuperBlockSizeInPixels] = {}; + alignas(kMaxAlignment) PredType + source2_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {}; + uint8_t source2_8bpp_[kMaxSuperBlockSizeInPixels * + kMaxSuperBlockSizeInPixels] = {}; + uint8_t source2_8bpp_cache_[kMaxSuperBlockSizeInPixels * + kMaxSuperBlockSizeInPixels] = {}; + uint8_t mask_[kMaxSuperBlockSizeInPixels * 
kMaxSuperBlockSizeInPixels]; + uint8_t dest_[sizeof(Pixel) * kMaxSuperBlockSizeInPixels * + kMaxSuperBlockSizeInPixels] = {}; + dsp::MaskBlendFunc func_; + dsp::InterIntraMaskBlendFunc8bpp func_8bpp_; +}; + +template <int bitdepth, typename Pixel> +void MaskBlendTest<bitdepth, Pixel>::Test(const char* const digest, + const int num_runs) { + if (func_ == nullptr && func_8bpp_ == nullptr) return; + const int width = param_.width >> param_.subsampling_x; + const int height = param_.height >> param_.subsampling_y; + + // Add id offset to seed just to add more randomness to input blocks. + // If we use the same seed for different block sizes, the generated input + // blocks are repeated. For example, if input size is 8x8, the generated + // block is exactly the upper left half of the generated 16x16 block. + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed() + + GetDigestIdOffset()); + PredType* src_1 = source1_; + uint8_t* src_1_8bpp = source1_8bpp_; + PredType* src_2 = source2_; + uint8_t* src_2_8bpp = source2_8bpp_; + const ptrdiff_t src_2_stride = param_.is_inter_intra ? kStride : width; + uint8_t* mask_row = mask_; + const int range_mask = (1 << (bitdepth)) - 1; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + src_1[x] = static_cast<PredType>(rnd.Rand16() & range_mask); + src_2[x] = static_cast<PredType>(rnd.Rand16() & range_mask); + if (param_.is_inter_intra && bitdepth == 8) { + src_1_8bpp[x] = src_1[x]; + src_2_8bpp[x] = src_2[x]; + } + if (!param_.is_inter_intra) { + // Implies isCompound == true. 
+ constexpr int bitdepth_index = (bitdepth - 8) >> 1; + const int min_val = kCompoundPredictionRange[bitdepth_index][0]; + const int max_val = kCompoundPredictionRange[bitdepth_index][1]; + src_1[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val); + src_2[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val); + } + } + src_1 += width; + src_1_8bpp += width; + src_2 += src_2_stride; + src_2_8bpp += src_2_stride; + } + // Mask should be setup regardless of subsampling. + for (int y = 0; y < param_.height; ++y) { + for (int x = 0; x < param_.width; ++x) { + mask_row[x] = rnd.Rand8() & 63; + mask_row[x] += rnd.Rand8() & 1; // Range of mask is [0, 64]. + } + mask_row += kStride; + } + + absl::Duration elapsed_time; + for (int i = 0; i < num_runs; ++i) { + const absl::Time start = absl::Now(); + if (param_.is_inter_intra && bitdepth == 8) { + ASSERT_EQ(func_, nullptr); + static_assert(sizeof(source2_8bpp_cache_) == sizeof(source2_8bpp_), ""); + // source2_8bpp_ is modified in the call. + memcpy(source2_8bpp_cache_, source2_8bpp_, sizeof(source2_8bpp_)); + func_8bpp_(source1_8bpp_, source2_8bpp_, src_2_stride, mask_, kStride, + width, height); + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + dest_[y * kDestStride + x] = source2_8bpp_[y * src_2_stride + x]; + } + } + memcpy(source2_8bpp_, source2_8bpp_cache_, sizeof(source2_8bpp_)); + } else { + if (bitdepth != 8) { + ASSERT_EQ(func_8bpp_, nullptr); + } + func_(source1_, source2_, src_2_stride, mask_, kStride, width, height, + dest_, kDestStride); + } + elapsed_time += absl::Now() - start; + } + + test_utils::CheckMd5Digest( + "MaskBlend", + absl::StrFormat("%dx%d", param_.width, param_.height).c_str(), digest, + dest_, sizeof(dest_), elapsed_time); +} + +const MaskBlendTestParam kMaskBlendTestParam[] = { + // is_inter_intra = false, is_wedge_inter_intra = false. + // block size range is from 8x8 to 128x128. 
+ MaskBlendTestParam(8, 8, 0, 0, false, false), + MaskBlendTestParam(8, 16, 0, 0, false, false), + MaskBlendTestParam(16, 8, 0, 0, false, false), + MaskBlendTestParam(16, 16, 0, 0, false, false), + MaskBlendTestParam(16, 32, 0, 0, false, false), + MaskBlendTestParam(32, 16, 0, 0, false, false), + MaskBlendTestParam(32, 32, 0, 0, false, false), + MaskBlendTestParam(32, 64, 0, 0, false, false), + MaskBlendTestParam(64, 32, 0, 0, false, false), + MaskBlendTestParam(64, 64, 0, 0, false, false), + MaskBlendTestParam(64, 128, 0, 0, false, false), + MaskBlendTestParam(128, 64, 0, 0, false, false), + MaskBlendTestParam(128, 128, 0, 0, false, false), + MaskBlendTestParam(8, 8, 1, 0, false, false), + MaskBlendTestParam(8, 16, 1, 0, false, false), + MaskBlendTestParam(16, 8, 1, 0, false, false), + MaskBlendTestParam(16, 16, 1, 0, false, false), + MaskBlendTestParam(16, 32, 1, 0, false, false), + MaskBlendTestParam(32, 16, 1, 0, false, false), + MaskBlendTestParam(32, 32, 1, 0, false, false), + MaskBlendTestParam(32, 64, 1, 0, false, false), + MaskBlendTestParam(64, 32, 1, 0, false, false), + MaskBlendTestParam(64, 64, 1, 0, false, false), + MaskBlendTestParam(64, 128, 1, 0, false, false), + MaskBlendTestParam(128, 64, 1, 0, false, false), + MaskBlendTestParam(128, 128, 1, 0, false, false), + MaskBlendTestParam(8, 8, 1, 1, false, false), + MaskBlendTestParam(8, 16, 1, 1, false, false), + MaskBlendTestParam(16, 8, 1, 1, false, false), + MaskBlendTestParam(16, 16, 1, 1, false, false), + MaskBlendTestParam(16, 32, 1, 1, false, false), + MaskBlendTestParam(32, 16, 1, 1, false, false), + MaskBlendTestParam(32, 32, 1, 1, false, false), + MaskBlendTestParam(32, 64, 1, 1, false, false), + MaskBlendTestParam(64, 32, 1, 1, false, false), + MaskBlendTestParam(64, 64, 1, 1, false, false), + MaskBlendTestParam(64, 128, 1, 1, false, false), + MaskBlendTestParam(128, 64, 1, 1, false, false), + MaskBlendTestParam(128, 128, 1, 1, false, false), + // is_inter_intra = true, is_wedge_inter_intra 
= false. + // block size range is from 8x8 to 32x32. + MaskBlendTestParam(8, 8, 0, 0, true, false), + MaskBlendTestParam(8, 16, 0, 0, true, false), + MaskBlendTestParam(16, 8, 0, 0, true, false), + MaskBlendTestParam(16, 16, 0, 0, true, false), + MaskBlendTestParam(16, 32, 0, 0, true, false), + MaskBlendTestParam(32, 16, 0, 0, true, false), + MaskBlendTestParam(32, 32, 0, 0, true, false), + MaskBlendTestParam(8, 8, 1, 0, true, false), + MaskBlendTestParam(8, 16, 1, 0, true, false), + MaskBlendTestParam(16, 8, 1, 0, true, false), + MaskBlendTestParam(16, 16, 1, 0, true, false), + MaskBlendTestParam(16, 32, 1, 0, true, false), + MaskBlendTestParam(32, 16, 1, 0, true, false), + MaskBlendTestParam(32, 32, 1, 0, true, false), + MaskBlendTestParam(8, 8, 1, 1, true, false), + MaskBlendTestParam(8, 16, 1, 1, true, false), + MaskBlendTestParam(16, 8, 1, 1, true, false), + MaskBlendTestParam(16, 16, 1, 1, true, false), + MaskBlendTestParam(16, 32, 1, 1, true, false), + MaskBlendTestParam(32, 16, 1, 1, true, false), + MaskBlendTestParam(32, 32, 1, 1, true, false), + // is_inter_intra = true, is_wedge_inter_intra = true. + // block size range is from 8x8 to 32x32. 
+ MaskBlendTestParam(8, 8, 0, 0, true, true), + MaskBlendTestParam(8, 16, 0, 0, true, true), + MaskBlendTestParam(16, 8, 0, 0, true, true), + MaskBlendTestParam(16, 16, 0, 0, true, true), + MaskBlendTestParam(16, 32, 0, 0, true, true), + MaskBlendTestParam(32, 16, 0, 0, true, true), + MaskBlendTestParam(32, 32, 0, 0, true, true), + MaskBlendTestParam(8, 8, 1, 0, true, true), + MaskBlendTestParam(8, 16, 1, 0, true, true), + MaskBlendTestParam(16, 8, 1, 0, true, true), + MaskBlendTestParam(16, 16, 1, 0, true, true), + MaskBlendTestParam(16, 32, 1, 0, true, true), + MaskBlendTestParam(32, 16, 1, 0, true, true), + MaskBlendTestParam(32, 32, 1, 0, true, true), + MaskBlendTestParam(8, 8, 1, 1, true, true), + MaskBlendTestParam(8, 16, 1, 1, true, true), + MaskBlendTestParam(16, 8, 1, 1, true, true), + MaskBlendTestParam(16, 16, 1, 1, true, true), + MaskBlendTestParam(16, 32, 1, 1, true, true), + MaskBlendTestParam(32, 16, 1, 1, true, true), + MaskBlendTestParam(32, 32, 1, 1, true, true), +}; + +using MaskBlendTest8bpp = MaskBlendTest<8, uint8_t>; + +TEST_P(MaskBlendTest8bpp, Blending) { Test(GetDigest8bpp(GetDigestId()), 1); } + +TEST_P(MaskBlendTest8bpp, DISABLED_Speed) { + Test(GetDigest8bpp(GetDigestId()), kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, MaskBlendTest8bpp, + testing::ValuesIn(kMaskBlendTestParam)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, MaskBlendTest8bpp, + testing::ValuesIn(kMaskBlendTestParam)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, MaskBlendTest8bpp, + testing::ValuesIn(kMaskBlendTestParam)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using MaskBlendTest10bpp = MaskBlendTest<10, uint16_t>; + +TEST_P(MaskBlendTest10bpp, Blending) { Test(GetDigest10bpp(GetDigestId()), 1); } + +TEST_P(MaskBlendTest10bpp, DISABLED_Speed) { + Test(GetDigest10bpp(GetDigestId()), kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, MaskBlendTest10bpp, + testing::ValuesIn(kMaskBlendTestParam)); + +#if LIBGAV1_ENABLE_SSE4_1 
+INSTANTIATE_TEST_SUITE_P(SSE41, MaskBlendTest10bpp, + testing::ValuesIn(kMaskBlendTestParam)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/motion_field_projection_test.cc b/src/dsp/motion_field_projection_test.cc new file mode 100644 index 0000000..3a47cc7 --- /dev/null +++ b/src/dsp/motion_field_projection_test.cc @@ -0,0 +1,213 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/motion_field_projection.h" + +#include <algorithm> +#include <array> +#include <cassert> +#include <cmath> +#include <cstdint> +#include <string> + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/array_2d.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/reference_info.h" +#include "src/utils/types.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kMotionFieldWidth = 160; +constexpr int kMotionFieldHight = 120; + +// The 'int' parameter is unused but required to allow for instantiations of C, +// NEON, etc. 
+class MotionFieldProjectionTest : public testing::TestWithParam<int> { + public: + MotionFieldProjectionTest() = default; + MotionFieldProjectionTest(const MotionFieldProjectionTest&) = delete; + MotionFieldProjectionTest& operator=(const MotionFieldProjectionTest&) = + delete; + ~MotionFieldProjectionTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(8); + MotionFieldProjectionInit_C(); + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + MotionFieldProjectionInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + MotionFieldProjectionInit_SSE4_1(); + } + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + const Dsp* const dsp = GetDspTable(8); + ASSERT_NE(dsp, nullptr); + target_motion_field_projection_kernel_func_ = + dsp->motion_field_projection_kernel; + } + + void SetInputData(int motion_field_width, libvpx_test::ACMRandom* rnd); + void TestRandomValues(bool speed); + + private: + MotionFieldProjectionKernelFunc target_motion_field_projection_kernel_func_; + ReferenceInfo reference_info_; + TemporalMotionField motion_field_; +}; + +void MotionFieldProjectionTest::SetInputData( + const int motion_field_width, libvpx_test::ACMRandom* const rnd) { + ASSERT_TRUE(reference_info_.Reset(kMotionFieldHight, motion_field_width)); + ASSERT_TRUE(motion_field_.mv.Reset(kMotionFieldHight, motion_field_width, + /*zero_initialize=*/false)); + ASSERT_TRUE(motion_field_.reference_offset.Reset(kMotionFieldHight, + motion_field_width, + /*zero_initialize=*/false)); + constexpr int order_hint_bits = 6; + unsigned int order_hint_shift_bits = Mod32(32 - order_hint_bits); + const unsigned int current_frame_order_hint = + rnd->Rand8() & ((1 << 
order_hint_bits) - 1); // [0, 63] + uint8_t reference_frame_order_hint = 0; + reference_info_.relative_distance_to[0] = 0; + reference_info_.skip_references[kReferenceFrameIntra] = true; + reference_info_.projection_divisions[kReferenceFrameIntra] = 0; + for (int i = kReferenceFrameLast; i < kNumReferenceFrameTypes; ++i) { + reference_frame_order_hint = + rnd->Rand8() & ((1 << order_hint_bits) - 1); // [0, 63] + const int relative_distance_to = + GetRelativeDistance(current_frame_order_hint, + reference_frame_order_hint, order_hint_shift_bits); + reference_info_.relative_distance_to[i] = relative_distance_to; + reference_info_.skip_references[i] = + relative_distance_to > kMaxFrameDistance || relative_distance_to <= 0; + reference_info_.projection_divisions[i] = + reference_info_.skip_references[i] + ? 0 + : kProjectionMvDivisionLookup[relative_distance_to]; + } + for (int y = 0; y < kMotionFieldHight; ++y) { + for (int x = 0; x < motion_field_width; ++x) { + reference_info_.motion_field_reference_frame[y][x] = + static_cast<ReferenceFrameType>(rnd->Rand16() & + kReferenceFrameAlternate); + reference_info_.motion_field_mv[y][x].mv[0] = rnd->Rand16Signed() / 512; + reference_info_.motion_field_mv[y][x].mv[1] = rnd->Rand16Signed() / 512; + } + } + MotionVector invalid_mv; + invalid_mv.mv[0] = kInvalidMvValue; + invalid_mv.mv[1] = kInvalidMvValue; + MotionVector* const motion_field_mv = &motion_field_.mv[0][0]; + int8_t* const motion_field_reference_offset = + &motion_field_.reference_offset[0][0]; + std::fill(motion_field_mv, motion_field_mv + motion_field_.mv.size(), + invalid_mv); + std::fill( + motion_field_reference_offset, + motion_field_reference_offset + motion_field_.reference_offset.size(), + -128); +} + +void MotionFieldProjectionTest::TestRandomValues(bool speed) { + static const char* const kDigestMv[8] = { + "87c2a74538f5c015809492ac2e521075", "ba7b4a5d82c6083b13a5b02eb7655ab7", + "8c37d96bf1744d5553860bf44a4f60a3", "720aa644f85e48995db9785e87cd02e3", + 
"9289c0c66524bb77a605870d78285f35", "f0326509885c2b2c89feeac53698cd47", + "6b9ad1d672dec825cb1803063d35badc", "dfe06c57cc9c70d27246df7fd0afa0b2"}; + static const char* const kDigestReferenceOffset[8] = { + "d8d1384268d7cf5c4514b39c329f94fb", "7f30e79ceb064befbad64a20d206a540", + "61e2eb5644edbd3a91b939403edc891e", "7a018f1bf88193e86934241af445dc36", + "2d6166bf8bbe1db77baf687ecf71d028", "95fee61f0219e06076d6f0e1073b1a4e", + "64d0a63751267bdc573cab761f1fe685", "906a99e0e791dbcb9183c9b68ecc4ea3"}; + const int num_tests = speed ? 2000 : 1; + if (target_motion_field_projection_kernel_func_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + for (int width_idx = 0; width_idx < 8; ++width_idx) { + const int motion_field_width = kMotionFieldWidth + width_idx; + SetInputData(motion_field_width, &rnd); + const int dst_sign = ((rnd.Rand16() & 1) != 0) ? 0 : -1; + const int reference_to_current_with_sign = + rnd.PseudoUniform(2 * kMaxFrameDistance + 1) - kMaxFrameDistance; + assert(std::abs(reference_to_current_with_sign) <= kMaxFrameDistance); + // Step of y8 and x8 is at least 16 except the last hop. 
+ for (int step = 16; step <= 80; step += 16) { + const absl::Time start = absl::Now(); + for (int k = 0; k < num_tests; ++k) { + for (int y8 = 0; y8 < kMotionFieldHight; y8 += step) { + const int y8_end = std::min(y8 + step, kMotionFieldHight); + for (int x8 = 0; x8 < motion_field_width; x8 += step) { + const int x8_end = std::min(x8 + step, motion_field_width); + target_motion_field_projection_kernel_func_( + reference_info_, reference_to_current_with_sign, dst_sign, y8, + y8_end, x8, x8_end, &motion_field_); + } + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "MotionFieldProjectionKernel", + absl::StrFormat("(mv) width %d step %d", motion_field_width, step) + .c_str(), + kDigestMv[width_idx], motion_field_.mv[0], + sizeof(motion_field_.mv[0][0]) * motion_field_.mv.size(), + elapsed_time); + test_utils::CheckMd5Digest( + "MotionFieldProjectionKernel", + absl::StrFormat("(ref offset) width %d step %d", motion_field_width, + step) + .c_str(), + kDigestReferenceOffset[width_idx], motion_field_.reference_offset[0], + sizeof(motion_field_.reference_offset[0][0]) * + motion_field_.reference_offset.size(), + elapsed_time); + } + } +} + +TEST_P(MotionFieldProjectionTest, Correctness) { TestRandomValues(false); } + +TEST_P(MotionFieldProjectionTest, DISABLED_Speed) { TestRandomValues(true); } + +INSTANTIATE_TEST_SUITE_P(C, MotionFieldProjectionTest, testing::Values(0)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, MotionFieldProjectionTest, testing::Values(0)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, MotionFieldProjectionTest, testing::Values(0)); +#endif + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/motion_vector_search_test.cc b/src/dsp/motion_vector_search_test.cc new file mode 100644 index 0000000..a7b2ec8 --- /dev/null +++ b/src/dsp/motion_vector_search_test.cc @@ -0,0 +1,197 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed 
under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/motion_vector_search.h" + +#include <cstdint> +#include <string> + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "src/utils/types.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// The 'int' parameter is unused but required to allow for instantiations of C, +// NEON, etc. 
+class MotionVectorSearchTest : public testing::TestWithParam<int>, + public test_utils::MaxAlignedAllocable { + public: + MotionVectorSearchTest() = default; + MotionVectorSearchTest(const MotionVectorSearchTest&) = delete; + MotionVectorSearchTest& operator=(const MotionVectorSearchTest&) = delete; + ~MotionVectorSearchTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(8); + MotionVectorSearchInit_C(); + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + MotionVectorSearchInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + MotionVectorSearchInit_SSE4_1(); + } + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + const Dsp* const dsp = GetDspTable(8); + ASSERT_NE(dsp, nullptr); + mv_projection_compound_[0] = dsp->mv_projection_compound[0]; + mv_projection_compound_[1] = dsp->mv_projection_compound[1]; + mv_projection_compound_[2] = dsp->mv_projection_compound[2]; + mv_projection_single_[0] = dsp->mv_projection_single[0]; + mv_projection_single_[1] = dsp->mv_projection_single[1]; + mv_projection_single_[2] = dsp->mv_projection_single[2]; + } + + void SetInputData(libvpx_test::ACMRandom* rnd); + void TestRandomValues(bool speed); + + private: + MvProjectionCompoundFunc mv_projection_compound_[3]; + MvProjectionSingleFunc mv_projection_single_[3]; + int reference_offsets_[2]; + alignas(kMaxAlignment) + MotionVector temporal_mvs_[kMaxTemporalMvCandidatesWithPadding]; + int8_t temporal_reference_offsets_[kMaxTemporalMvCandidatesWithPadding]; + CompoundMotionVector compound_mv_org_[kMaxTemporalMvCandidates + 1] + [kMaxTemporalMvCandidatesWithPadding]; + alignas(kMaxAlignment) + CompoundMotionVector 
compound_mv_[kMaxTemporalMvCandidates + 1] + [kMaxTemporalMvCandidatesWithPadding]; + MotionVector single_mv_org_[kMaxTemporalMvCandidates + 1] + [kMaxTemporalMvCandidatesWithPadding]; + alignas(kMaxAlignment) + MotionVector single_mv_[kMaxTemporalMvCandidates + 1] + [kMaxTemporalMvCandidatesWithPadding]; +}; + +void MotionVectorSearchTest::SetInputData(libvpx_test::ACMRandom* const rnd) { + reference_offsets_[0] = + Clip3(rnd->Rand16(), -kMaxFrameDistance, kMaxFrameDistance); + reference_offsets_[1] = + Clip3(rnd->Rand16(), -kMaxFrameDistance, kMaxFrameDistance); + for (int i = 0; i < kMaxTemporalMvCandidatesWithPadding; ++i) { + temporal_reference_offsets_[i] = rnd->RandRange(kMaxFrameDistance); + for (auto& mv : temporal_mvs_[i].mv) { + mv = rnd->Rand16Signed() / 8; + } + } + for (int i = 0; i <= kMaxTemporalMvCandidates; ++i) { + for (int j = 0; j < kMaxTemporalMvCandidatesWithPadding; ++j) { + for (int k = 0; k < 2; ++k) { + single_mv_[i][j].mv[k] = rnd->Rand16Signed(); + for (auto& mv : compound_mv_[i][j].mv[k].mv) { + mv = rnd->Rand16Signed(); + } + } + compound_mv_org_[i][j] = compound_mv_[i][j]; + single_mv_org_[i][j] = single_mv_[i][j]; + } + } +} + +void MotionVectorSearchTest::TestRandomValues(bool speed) { + static const char* const kDigestCompound[3] = { + "74c055b06c3701b2e50f2c964a6130b9", "cab21dd54f0a1bf6e80b58cdcf1fe0a9", + "e42de30cd84fa4e7b8581a330ed08a8b"}; + static const char* const kDigestSingle[3] = { + "265ffbb59d0895183f8e2d90b6652c71", "5068d980c4ce42ed3f11963b8aece6cc", + "7e699d58df3954a38ff11c8e34151e66"}; + const int num_tests = speed ? 
1000000 : 1; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + for (int function_index = 0; function_index < 3; ++function_index) { + SetInputData(&rnd); + if (mv_projection_compound_[function_index] == nullptr) continue; + const absl::Time start = absl::Now(); + for (int count = 1; count <= kMaxTemporalMvCandidates; ++count) { + const int total_count = count + (count & 1); + for (int i = 0; i < num_tests; ++i) { + mv_projection_compound_[function_index]( + temporal_mvs_, temporal_reference_offsets_, reference_offsets_, + count, compound_mv_[count]); + } + // One more element could be calculated in SIMD implementations. + // Restore the original values if any. + for (int i = count; i < total_count; ++i) { + compound_mv_[count][i] = compound_mv_org_[count][i]; + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "MvProjectionCompound", + absl::StrFormat("function_index %d", function_index).c_str(), + kDigestCompound[function_index], compound_mv_, sizeof(compound_mv_), + elapsed_time); + } + for (int function_index = 0; function_index < 3; ++function_index) { + SetInputData(&rnd); + if (mv_projection_single_[function_index] == nullptr) continue; + const absl::Time start = absl::Now(); + for (int count = 1; count <= kMaxTemporalMvCandidates; ++count) { + const int total_count = (count + 3) & ~3; + for (int i = 0; i < num_tests; ++i) { + mv_projection_single_[function_index]( + temporal_mvs_, temporal_reference_offsets_, reference_offsets_[0], + count, single_mv_[count]); + } + // Up to three more elements could be calculated in SIMD implementations. + // Restore the original values if any. 
+ for (int i = count; i < total_count; ++i) { + single_mv_[count][i] = single_mv_org_[count][i]; + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "MvProjectionSingle", + absl::StrFormat("function_index %d", function_index).c_str(), + kDigestSingle[function_index], single_mv_, sizeof(single_mv_), + elapsed_time); + } +} + +TEST_P(MotionVectorSearchTest, Correctness) { TestRandomValues(false); } + +TEST_P(MotionVectorSearchTest, DISABLED_Speed) { TestRandomValues(true); } + +INSTANTIATE_TEST_SUITE_P(C, MotionVectorSearchTest, testing::Values(0)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, MotionVectorSearchTest, testing::Values(0)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, MotionVectorSearchTest, testing::Values(0)); +#endif + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/obmc_test.cc b/src/dsp/obmc_test.cc new file mode 100644 index 0000000..60b10c6 --- /dev/null +++ b/src/dsp/obmc_test.cc @@ -0,0 +1,349 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/obmc.h" + +#include <algorithm> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <ostream> +#include <string> + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +#include "src/dsp/obmc.inc" + +constexpr int kMaxBlendingBlockSize = 64; +constexpr int kNumSpeedTests = 1000000; + +const char* GetDigest8bpp(int id) { + static const char* const kDigest[] = { + "76906f87892c30c7059a5c97e4838c42", "0b8670d937217c66425f2662b51eebbe", + "c8659acd1e8ecdab06be73f0954fa1ae", "e785f31f2723a193fefd534bd6f6c18f", + "751fcd8a345fef1c38a25293c9b528c0", "69af412dfa5e96ad43b79c178cb1c58b", + "2766a64622e183bb4614f2018f14fa85", "8d98589a5cef6e68ee8fadf19d420e3c", + "19eccf31dd8cf1abcee9414128fe4141", "35019f98e30bcbc6ab624682a0628519", + "199c551164e73c100045d7ab033ffdcc", "ad5a5eb2906265690c22741b0715f37b", + "e2152dea159249149ff4151111b73ed6", "6b44c0052789ce2fa4df882f35618e7d", + "1edd570bec7e63780d83588f6aacda25", "b04b81c9e52c58885907dc7f1ef2c11c", + "b24ad192e151b1e0f74d1493004cb1b6", "6c1ce7ed3463cc60870e336f990d4f14", + "2e6b7a06da21512dfdd9a517d2988655", "971ba1c41ab13bb341c04f936760f546", + "55b803239d9f12888c666c5320450937", "3d0838963f8c95dafbfb8e5e25c865d2", + "98a9be6245720d4e0da18115c1a1dbd7", "7e7afe3136ad681b5ea05664fe916548", + "33971753243f09106173199b7bae1ef5", "65413f33c19a42c112d395121aa4b3b4", + }; + return kDigest[id]; +} + +const char* GetDigestSpeed8bpp(int id) { + static const char* const kDigest[] = { + "c5b532f5960477bdd50684ab25fae0f4", "bf76ed404bc5674e0a4ff238efceb62b", + 
"5ea519b616cd2998fbb9b25b4c2660cb", "f23d18197a96de48901738d130a147d9", + "07b4140c693947a63865f835089766c4", "62547d29bc4dfb2e201e9d907c09e345", + "c3988da521be50aeb9944564001b282b", "d5a8ff9ca1bd49f4260bb497c489b06c", + "b3e94f1e33c316759ebf47620327168c", "c5e64a34ca7e55f4daed19cbe4c27049", + "3b234eb729e8e79db8692c4cbe1b6667", "f9f3060a44c3a575470f9700b3c3a75b", + "e3a1960b0a7238db1184a3f9d8e9a4b2", "721c7e8ec3aa0608b64f10f7ff5427db", + "ba9938553703d520bc0ade427c397140", "8b6e15e8ecd234363f70f51c64b0aea1", + "31bf64a6ed1e8002d488c0b9dcffb80a", "9ab1f3ae2e7f70cd27452f30cecfd18e", + "eaf25ac79ad70fc17ca96d8fcdf0f939", "9aaa88cb5e6b8757e37c3430bd664e70", + "8293874b2794df8fd22f5a35c3de7bee", "e9d6ee9106227c2c67ea9e6a4652e4ad", + "29f8a6fc2a650f3945a4ea6d3b975b6d", "8f300a257e913a42666b4921b2b0b5c5", + "a526265c4b3c8593736a82ddc1fd1603", "76e248f6756ac96343204b0e48d72a9e", + }; + return kDigest[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetDigest10bpp(int id) { + static const char* const kDigest[] = { + "6ab8f28e8fb3c4b10b23efee38d4154e", "d4374005d34e43e06c1b0c906289dadd", + "6f922e4142b644ca3f1eb0f363a1c34e", "84e7c098a9335b36082fec0bc7203075", + "40f00ea6884fea23a3b7fae59e3b02c3", "70cb92d08b4fdb6dd9c7d418cb1455d3", + "ed550798b56e70439a93cb48c359e873", "55e0d927b984e78cd51a1961e58a431d", + "482a6856b87265a82e4ea3fdadb2d95b", "0be46226ff87d74ff2ce68a83eaf9cca", + "bb4461f0131a1693a0a76f21d92a480b", "ea24f78d74c7864fb247c9a98c9b97b6", + "d2e70b81882aeb3d9fccef89e7552a9d", "4a692ddf91905727bc524d91735cf93c", + "f5d882ee6d9ae6f7dfa467ca99301424", "58821b87e7d9d4388d6003ffcb3723d1", + "824ddb98eb4129b3d254c0bc7a64cd73", "5eaaafa8ef9b7ba5e2856a947e5b33df", + "071de1494e0f1b2f99266b90bdc43ddd", "c33227a96dad506adc32dacfb371ab78", + "e8a632f9fff240c439d4ae6e86795046", "26b90d74f18f9df4427b6180d48db1fc", + "e4a01e492ddc0398b5c5b60c81468242", "f1b4f7ab5c8b949e51db104f2e33565a", + "b1fb9ecc6a552e2b23ee92e2f3e4122a", "a683d20129a91bb20b904aa20c0499b1", + }; + 
return kDigest[id]; +} + +const char* GetDigestSpeed10bpp(int id) { + static const char* const kDigest[] = { + "df59e5fd6e0237a56381f3a516806eb8", "f478bdf43e0b91b8dc9b2661eb207e49", + "80557576299708005111029cef04da53", "24f84f07f53f61cd46bdcfe1e05ff9b5", + "4dd6bc62145baa5357a4cbf6d7a6ef15", "0b7aa27cee43b8ae0c02d07887eaa225", + "9e28cdae73ca97433499c31ca79e1d07", "1cacd6466a143f88e736fffaf21e2246", + "9c7699626660d8965e06a54282a408f3", "eef893efef62b2eb4aaad06fc462819c", + "4965d0a3ff750813df85c0082b21bd4b", "ec10fd79fbf552abc595def392e9a863", + "a148bbafdc4466fbb700b31acccca8ac", "ff0566921ff2d5145f79fbf409508fb2", + "5da9d960988549f53b817003b93e4d01", "fa9028b2ed049ad71b5fd15f2daacbe5", + "b4c4f88d1fb54869ce7ff452ca7786a6", "d607f785fce62bad85102054539e7089", + "b441761ea2817e4618c594aaa11d670a", "1cc5e08e6d5f9315dbc0369b97af941d", + "568cc1a3a67ba4e6e77f54602d0ed3e3", "522f14c068f788bc284a7d1e47d623ed", + "b543855cbe384b88861c881853c28192", "5faaafc124e94eedc69dc0f5d33dacac", + "13ca4d01bd20085459e6126555e1f7b5", "46d46fae3c8a7d9e4725154d8d2b76d8", + }; + return kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +struct ObmcTestParam { + ObmcTestParam(int width, int height, ObmcDirection blending_direction) + : width(width), height(height), blending_direction(blending_direction) {} + int width; + int height; + ObmcDirection blending_direction; +}; + +std::ostream& operator<<(std::ostream& os, const ObmcTestParam& param) { + return os << "BlockSize" << param.width << "x" << param.height + << ", blending_direction: " << ToString(param.blending_direction); +} + +template <int bitdepth, typename Pixel> +class ObmcBlendTest : public testing::TestWithParam<ObmcTestParam> { + public: + ObmcBlendTest() = default; + ~ObmcBlendTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + ObmcInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + const testing::TestInfo* const test_info = + 
testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + ObmcInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + ObmcInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + func_ = dsp->obmc_blend[blending_direction_]; + } + + protected: + int GetDigestId() const { + // blending_direction_ == 0: + // (width, height): + // (2, 2), id = 0. (2, 4), id = 1. (4, 2), id = 2. + // (4, 4), id = 3. (4, 8), id = 4. (8, 4), id = 5. + // ... + // blending_direction_ == 1: id starts from 13. + const int id = (blending_direction_ == kObmcDirectionVertical) ? 0 : 13; + if (width_ == height_) return id + 3 * (FloorLog2(width_) - 1); + if (width_ < height_) return id + 1 + 3 * (FloorLog2(width_) - 1); + return id + 2 + 3 * (FloorLog2(height_) - 1); + } + + // Note |digest| is only used when |use_fixed_values| is false. 
+ void Test(const char* digest, bool use_fixed_values, int value); + void TestSpeed(const char* digest, int num_runs); + + private: + const int width_ = GetParam().width; + const int height_ = GetParam().height; + const int blending_direction_ = GetParam().blending_direction; + Pixel source1_[kMaxBlendingBlockSize * kMaxBlendingBlockSize] = {}; + Pixel source2_[kMaxBlendingBlockSize * kMaxBlendingBlockSize] = {}; + dsp::ObmcBlendFunc func_; +}; + +template <int bitdepth, typename Pixel> +void ObmcBlendTest<bitdepth, Pixel>::Test(const char* const digest, + const bool use_fixed_values, + const int value) { + if (func_ == nullptr) return; + if (use_fixed_values) { + std::fill(source1_, + source1_ + kMaxBlendingBlockSize * kMaxBlendingBlockSize, value); + std::fill(source2_, + source2_ + kMaxBlendingBlockSize * kMaxBlendingBlockSize, value); + } else { + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + Pixel* src_1 = source1_; + Pixel* src_2 = source2_; + const int mask = (1 << bitdepth) - 1; + for (int y = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x) { + src_1[x] = rnd.Rand16() & mask; + src_2[x] = rnd.Rand16() & mask; + } + src_1 += kMaxBlendingBlockSize; + src_2 += kMaxBlendingBlockSize; + } + } + const ptrdiff_t stride = kMaxBlendingBlockSize * sizeof(Pixel); + func_(source1_, stride, width_, height_, source2_, stride); + if (use_fixed_values) { + const bool success = test_utils::CompareBlocks( + source1_, source2_, width_, height_, kMaxBlendingBlockSize, + kMaxBlendingBlockSize, false); + EXPECT_TRUE(success); + } else { + test_utils::CheckMd5Digest( + "Obmc", absl::StrFormat("%dx%d", width_, height_).c_str(), digest, + source1_, sizeof(source1_), absl::Duration()); + } +} + +template <int bitdepth, typename Pixel> +void ObmcBlendTest<bitdepth, Pixel>::TestSpeed(const char* const digest, + const int num_runs) { + if (func_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + Pixel* 
src_1 = source1_; + Pixel* src_2 = source2_; + const int mask = (1 << bitdepth) - 1; + for (int y = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x) { + src_1[x] = rnd.Rand16() & mask; + src_2[x] = rnd.Rand16() & mask; + } + src_1 += kMaxBlendingBlockSize; + src_2 += kMaxBlendingBlockSize; + } + const ptrdiff_t stride = kMaxBlendingBlockSize * sizeof(Pixel); + uint8_t dest[sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize]; + absl::Duration elapsed_time; + for (int i = 0; i < num_runs; ++i) { + memcpy(dest, source1_, + sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize); + const absl::Time start = absl::Now(); + func_(dest, stride, width_, height_, source2_, stride); + elapsed_time += absl::Now() - start; + } + memcpy(source1_, dest, + sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize); + test_utils::CheckMd5Digest("Obmc", + absl::StrFormat("%dx%d", width_, height_).c_str(), + digest, source1_, sizeof(source1_), elapsed_time); +} + +const ObmcTestParam kObmcTestParam[] = { + ObmcTestParam(2, 2, kObmcDirectionVertical), + ObmcTestParam(2, 4, kObmcDirectionVertical), + ObmcTestParam(4, 2, kObmcDirectionVertical), + ObmcTestParam(4, 4, kObmcDirectionVertical), + ObmcTestParam(4, 8, kObmcDirectionVertical), + ObmcTestParam(8, 4, kObmcDirectionVertical), + ObmcTestParam(8, 8, kObmcDirectionVertical), + ObmcTestParam(8, 16, kObmcDirectionVertical), + ObmcTestParam(16, 8, kObmcDirectionVertical), + ObmcTestParam(16, 16, kObmcDirectionVertical), + ObmcTestParam(16, 32, kObmcDirectionVertical), + ObmcTestParam(32, 16, kObmcDirectionVertical), + ObmcTestParam(32, 32, kObmcDirectionVertical), + ObmcTestParam(2, 2, kObmcDirectionHorizontal), + ObmcTestParam(2, 4, kObmcDirectionHorizontal), + ObmcTestParam(4, 2, kObmcDirectionHorizontal), + ObmcTestParam(4, 4, kObmcDirectionHorizontal), + ObmcTestParam(4, 8, kObmcDirectionHorizontal), + ObmcTestParam(8, 4, kObmcDirectionHorizontal), + ObmcTestParam(8, 8, kObmcDirectionHorizontal), + 
ObmcTestParam(8, 16, kObmcDirectionHorizontal), + ObmcTestParam(16, 8, kObmcDirectionHorizontal), + ObmcTestParam(16, 16, kObmcDirectionHorizontal), + ObmcTestParam(16, 32, kObmcDirectionHorizontal), + ObmcTestParam(32, 16, kObmcDirectionHorizontal), + ObmcTestParam(32, 32, kObmcDirectionHorizontal), +}; + +using ObmcBlendTest8bpp = ObmcBlendTest<8, uint8_t>; + +TEST_P(ObmcBlendTest8bpp, Blending) { + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 0); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 1); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 128); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 255); + Test(GetDigest8bpp(GetDigestId()), /*use_fixed_values=*/false, -1); +} + +TEST_P(ObmcBlendTest8bpp, DISABLED_Speed) { + TestSpeed( + GetDigestSpeed8bpp(GetDigestId()), + (kNumSpeedTests * 32 * 32) / (GetParam().height * GetParam().width)); +} + +INSTANTIATE_TEST_SUITE_P(C, ObmcBlendTest8bpp, + testing::ValuesIn(kObmcTestParam)); + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, ObmcBlendTest8bpp, + testing::ValuesIn(kObmcTestParam)); +#endif + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, ObmcBlendTest8bpp, + testing::ValuesIn(kObmcTestParam)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using ObmcBlendTest10bpp = ObmcBlendTest<10, uint16_t>; + +TEST_P(ObmcBlendTest10bpp, Blending) { + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 0); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 1); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 128); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, (1 << 10) - 1); + Test(GetDigest10bpp(GetDigestId()), /*use_fixed_values=*/false, -1); +} + +TEST_P(ObmcBlendTest10bpp, DISABLED_Speed) { + TestSpeed( + GetDigestSpeed10bpp(GetDigestId()), + (kNumSpeedTests * 32 * 32) / (GetParam().height * GetParam().width)); +} + +INSTANTIATE_TEST_SUITE_P(C, ObmcBlendTest10bpp, + testing::ValuesIn(kObmcTestParam)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, 
ObmcBlendTest10bpp, + testing::ValuesIn(kObmcTestParam)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/super_res.cc b/src/dsp/super_res.cc index d041bd1..abb01a1 100644 --- a/src/dsp/super_res.cc +++ b/src/dsp/super_res.cc @@ -26,10 +26,10 @@ namespace { template <int bitdepth, typename Pixel> void SuperRes_C(const void* /*coefficients*/, void* const source, - const ptrdiff_t stride, const int height, + const ptrdiff_t source_stride, const int height, const int downscaled_width, const int upscaled_width, - const int initial_subpixel_x, const int step, - void* const dest) { + const int initial_subpixel_x, const int step, void* const dest, + ptrdiff_t dest_stride) { assert(step <= 1 << kSuperResScaleBits); auto* src = static_cast<Pixel*>(source) - DivideBy2(kSuperResFilterTaps); auto* dst = static_cast<Pixel*>(dest); @@ -61,8 +61,8 @@ void SuperRes_C(const void* /*coefficients*/, void* const source, (1 << bitdepth) - 1); subpixel_x += step; } while (++x < upscaled_width); - src += stride; - dst += stride; + src += source_stride; + dst += dest_stride; } while (--y != 0); } diff --git a/src/dsp/super_res_test.cc b/src/dsp/super_res_test.cc new file mode 100644 index 0000000..a93fc31 --- /dev/null +++ b/src/dsp/super_res_test.cc @@ -0,0 +1,264 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/super_res.h" + +#include <cstdint> +#include <cstdio> +#include <cstring> +#include <string> +#include <vector> + +#include "absl/strings/match.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kNumSpeedTests = 5e5; + +const char* GetDigest8bpp(int id) { + static const char* const kDigestSuperRes[] = { + "52eb4eac1df0c51599d57696405b69d0", "ccb07cc8295fd1440ff2e3b9199ec4f9", + "baef34cca795b95f3d1fd81d609da679", "03f1579c2773c8ba9c867316a22b94a3"}; + return kDigestSuperRes[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetDigest10bpp(int id) { + static const char* const kDigestSuperRes[] = { + "8fd78e05d944aeb11fac278b47ee60ba", "948eaecb70fa5614ce1c1c95e9942dc3", + "126cd7727e787e0625ec3f5ce97f8fa0", "85c806c41d40b841764bcb54f6d3a712"}; + return kDigestSuperRes[id]; +} +#endif + +struct SuperResTestParam { + SuperResTestParam(int downscaled_width, int upscaled_width) + : downscaled_width(downscaled_width), upscaled_width(upscaled_width) {} + int downscaled_width; + int upscaled_width; +}; + +template <int bitdepth, typename Pixel, typename Coefficient> +class SuperResTest : public testing::TestWithParam<SuperResTestParam>, + public test_utils::MaxAlignedAllocable { + public: + SuperResTest() = default; + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + SuperResInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + + const testing::TestInfo* const test_info = + 
testing::UnitTest::GetInstance()->current_test_info(); + const std::vector<std::string> split_test_name = + absl::StrSplit(test_info->name(), '/'); + ASSERT_TRUE(absl::SimpleAtoi(split_test_name[1], &test_id_)); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + SuperResInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + SuperResInit_SSE4_1(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + super_res_coefficients_ = dsp->super_res_coefficients; + func_ = dsp->super_res; + } + + void TestComputeSuperRes(int fixed_value, int num_runs); + + private: + static constexpr int kHeight = 127; + // The maximum width that must be allocated. + static constexpr int kUpscaledBufferWidth = 192; + // Allow room for the filter taps. + static constexpr int kStride = + ((kUpscaledBufferWidth + 2 * kSuperResHorizontalBorder + 15) & ~15); + const int kDownscaledWidth = GetParam().downscaled_width; + const int kUpscaledWidth = GetParam().upscaled_width; + int test_id_; + SuperResCoefficientsFunc super_res_coefficients_; + SuperResFunc func_; + Pixel source_buffer_[kHeight][kStride]; + alignas(kMaxAlignment) Pixel dest_buffer_[kHeight][kStride]; + alignas(kMaxAlignment) Coefficient + superres_coefficients_[kSuperResFilterTaps * kUpscaledBufferWidth]; +}; + +template <int bitdepth, typename Pixel, typename Coefficient> +void SuperResTest<bitdepth, Pixel, Coefficient>::TestComputeSuperRes( + int fixed_value, int num_runs) { + if (func_ == nullptr) return; + const int superres_width = kDownscaledWidth << kSuperResScaleBits; + const int step = (superres_width + kUpscaledWidth / 2) / kUpscaledWidth; + const int error = step * kUpscaledWidth - superres_width; + const int initial_subpixel_x = + ((-((kUpscaledWidth - kDownscaledWidth) << (kSuperResScaleBits - 1)) + + DivideBy2(kUpscaledWidth)) / + kUpscaledWidth + 
+ (1 << (kSuperResExtraBits - 1)) - error / 2) & + kSuperResScaleMask; + if (super_res_coefficients_ != nullptr) { + super_res_coefficients_(kUpscaledWidth, initial_subpixel_x, step, + superres_coefficients_); + } + memset(dest_buffer_, 0, sizeof(dest_buffer_)); + if (fixed_value != 0) { + SetBlock<Pixel>(kHeight, kStride, fixed_value, source_buffer_[0], kStride); + } else { + // Random values. + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + const int bitdepth_mask = (1 << bitdepth) - 1; + for (int y = 0; y < kHeight; ++y) { + for (int x = 0; x < kStride; ++x) { + source_buffer_[y][x] = rnd.Rand16() & bitdepth_mask; + } + } + } + // Offset starting point in the buffer to accommodate line extension. + Pixel* src_ptr = source_buffer_[0] + kSuperResHorizontalBorder; + + const absl::Time start = absl::Now(); + for (int i = 0; i < num_runs; ++i) { + func_(superres_coefficients_, src_ptr, kStride, kHeight, kDownscaledWidth, + kUpscaledWidth, initial_subpixel_x, step, dest_buffer_, kStride); + } + const absl::Duration elapsed_time = absl::Now() - start; + + if (fixed_value != 0) { + for (int y = 0; y < kHeight; ++y) { + for (int x = 0; x < kUpscaledWidth; ++x) { + EXPECT_TRUE(dest_buffer_[y][x] == fixed_value) + << "At location [" << y << ", " << x + << "]\nexpected: " << fixed_value + << "\nactual: " << dest_buffer_[y][x]; + } + } + } else if (num_runs == 1) { + // Random values. + if ((kUpscaledWidth & 15) != 0) { + // The SIMD functions overwrite up to 15 pixels in each row. Reset them. 
+ for (int y = 0; y < kHeight; ++y) { + for (int x = kUpscaledWidth; x < Align(kUpscaledWidth, 16); ++x) { + dest_buffer_[y][x] = 0; + } + } + } + const char* expected_digest; + if (bitdepth == 8) { + expected_digest = GetDigest8bpp(test_id_); + } else { +#if LIBGAV1_MAX_BITDEPTH >= 10 + expected_digest = GetDigest10bpp(test_id_); +#endif + } + test_utils::CheckMd5Digest( + "SuperRes", + absl::StrFormat("width %d, step %d, start %d", kUpscaledWidth, step, + initial_subpixel_x) + .c_str(), + expected_digest, dest_buffer_, sizeof(dest_buffer_), elapsed_time); + } else { + // Speed test. + printf("Mode SuperRes [width %d, step %d, start %d]: %d us\n", + kUpscaledWidth, step, initial_subpixel_x, + static_cast<int>(absl::ToInt64Microseconds(elapsed_time))); + } +} + +using SuperResTest8bpp = SuperResTest<8, uint8_t, int8_t>; + +TEST_P(SuperResTest8bpp, FixedValues) { + TestComputeSuperRes(100, 1); + TestComputeSuperRes(255, 1); + TestComputeSuperRes(1, 1); +} + +TEST_P(SuperResTest8bpp, RandomValues) { TestComputeSuperRes(0, 1); } + +TEST_P(SuperResTest8bpp, DISABLED_Speed) { + TestComputeSuperRes(0, kNumSpeedTests); +} + +const SuperResTestParam kSuperResTestParams[] = { + SuperResTestParam(96, 192), + SuperResTestParam(171, 192), + SuperResTestParam(102, 128), + SuperResTestParam(61, 121), +}; + +INSTANTIATE_TEST_SUITE_P(C, SuperResTest8bpp, + testing::ValuesIn(kSuperResTestParams)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, SuperResTest8bpp, + testing::ValuesIn(kSuperResTestParams)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, SuperResTest8bpp, + testing::ValuesIn(kSuperResTestParams)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using SuperResTest10bpp = SuperResTest<10, uint16_t, int16_t>; + +TEST_P(SuperResTest10bpp, FixedValues) { + TestComputeSuperRes(100, 1); + TestComputeSuperRes(511, 1); + TestComputeSuperRes(1, 1); +} + +TEST_P(SuperResTest10bpp, RandomValues) { TestComputeSuperRes(0, 1); } + 
+TEST_P(SuperResTest10bpp, DISABLED_Speed) { + TestComputeSuperRes(0, kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, SuperResTest10bpp, + testing::ValuesIn(kSuperResTestParams)); + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, SuperResTest10bpp, + testing::ValuesIn(kSuperResTestParams)); +#endif + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, SuperResTest10bpp, + testing::ValuesIn(kSuperResTestParams)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/warp_test.cc b/src/dsp/warp_test.cc new file mode 100644 index 0000000..e7384f4 --- /dev/null +++ b/src/dsp/warp_test.cc @@ -0,0 +1,649 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/warp.h" + +#include <algorithm> +#include <cassert> +#include <cmath> +#include <cstddef> +#include <cstdint> +#include <cstdio> +#include <cstdlib> +#include <ostream> +#include <string> +#include <type_traits> + +#include "absl/base/macros.h" +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/post_filter.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kSourceBorderHorizontal = 16; +constexpr int kSourceBorderVertical = 13; + +constexpr int kMaxSourceBlockWidth = + kMaxSuperBlockSizeInPixels + kSourceBorderHorizontal * 2; +constexpr int kMaxSourceBlockHeight = + kMaxSuperBlockSizeInPixels + kSourceBorderVertical * 2; +constexpr int kMaxDestBlockWidth = + kMaxSuperBlockSizeInPixels + kConvolveBorderLeftTop * 2; +constexpr int kMaxDestBlockHeight = + kMaxSuperBlockSizeInPixels + kConvolveBorderLeftTop * 2; + +constexpr uint16_t kDivisorLookup[257] = { + 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768, + 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142, + 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564, + 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028, + 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530, + 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066, + 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633, + 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228, + 12193, 
12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848, + 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491, + 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155, + 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838, + 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538, + 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255, + 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986, + 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732, + 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489, + 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259, + 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039, + 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830, + 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630, + 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439, + 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257, + 8240, 8224, 8208, 8192}; + +template <bool is_compound> +const char* GetDigest8bpp(int id) { + static const char* const kDigest[] = { + "77ba358a0f5e19a8e69fa0a95712578e", "141b23d13a04e0b84d26d514de76d6b0", + "b0265858454b979852ffadae323f0fb7", "9cf38e3579265b656f1f2100ba15b0e9", + "ab51d05cc255ef8e37921182df1d89b1", "e3e96f90a4b07ca733e40f057dc01c41", + "4eee8c1a52a62a266db9b1c9338e124c", "901a87d8f88f6324dbc0960a6de861ac", + "da9cb6faf6adaeeae12b6784f39186c5", "14450ab05536cdb0d2f499716ccb559d", + "566b396cbf008bbb869b364fdc81860d", "681a872baf2de4e58d73ea9ab8643a72", + "7f17d290d513a7416761b3a01f10fd2f", + }; + static const char* const kCompoundDigest[] = { + "7e9339d265b7beac7bbe32fe7bb0fccb", "f747d663b427bb38a3ff36b0815a394c", + "858cf54d2253281a919fbdb48fe91c53", "4721dd97a212c6068bd488f400259afc", + "36878c7906492bc740112abdea77616f", "89deb68aa35764bbf3024b501a6bed50", + 
"8ac5b08f9b2afd38143c357646af0f82", "bf6e2a64835ea0c9d7467394253d0eb2", + "7b0a539acd2a27eff398dd084abad933", "61c8d81b397c1cf727ff8a9fabab90af", + "4d412349a25a832c1fb3fb29e3f0e2b3", "2c6dd2a9a4ede9fa00adb567ba646f30", + "b2a0ce68db3cadd207299f73112bed74", + }; + return is_compound ? kCompoundDigest[id] : kDigest[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +template <bool is_compound> +const char* GetDigest10bpp(int id) { + static const char* const kDigest[] = { + "1fef54f56a0bafccf7f8da1ac3b18b76", "8a65c72f171feafa2f393d31d6b7fe1b", + "808019346f2f1f45f8cf2e9fc9a49320", "c28e2f2c6c830a29bcc2452166cba521", + "f040674d6f54e8910d655f0d11fd8cdd", "473af9bb1c6023965c2284b716feef97", + "e4f6d7babd0813d5afb0f575ebfa8166", "58f96ef8a880963a213624bb0d06d47c", + "1ec0995fa4490628b679d03683233388", "9526fb102fde7dc1a7e160e65af6da33", + "f0457427d0c0e31d82ea4f612f7f86f1", "ddc82ae298cccebad493ba9de0f69fbd", + "5ed615091e2f62df26de7e91a985cb81", + }; + static const char* const kCompoundDigest[] = { + "8e6986ae143260e0b8b4887f15a141a1", "0a7f0db8316b8c3569f08834dd0c6f50", + "90705b2e7dbe083e8a1f70f29d6f257e", "e428a75bea77d769d21f3f7a1d2b0b38", + "a570b13d790c085c4ab50d71dd085d56", "e5d043c6cd6ff6dbab6e38a8877e93bd", + "12ea96991e46e3e9aa78ab812ffa0525", "84293a94a53f1cf814fa25e793c3fe27", + "b98a7502c84ac8437266f702dcc0a92e", "d8db5d52e9b0a5be0ad2d517d5bd16e9", + "f3be504bbb609ce4cc71c5539252638a", "fcde83b54e14e9de23460644f244b047", + "42eb66e752e9ef289b47053b5c73fdd6", + }; + return is_compound ? kCompoundDigest[id] : kDigest[id]; +} +#endif + +int RandomWarpedParam(int seed_offset, int bits) { + libvpx_test::ACMRandom rnd(seed_offset + + libvpx_test::ACMRandom::DeterministicSeed()); + // 1 in 8 chance of generating zero (arbitrary). + const bool zero = (rnd.Rand16() & 7) == 0; + if (zero) return 0; + // Generate uniform values in the range [-(1 << bits), 1] U [1, 1 << + // bits]. 
+ const int mask = (1 << bits) - 1; + const int value = 1 + (rnd.RandRange(1u << 31) & mask); + const bool sign = (rnd.Rand16() & 1) != 0; + return sign ? value : -value; +} + +// This function is a copy from warp_prediction.cc. +template <typename T> +void GenerateApproximateDivisor(T value, int16_t* division_factor, + int16_t* division_shift) { + const int n = FloorLog2(std::abs(value)); + const T e = std::abs(value) - (static_cast<T>(1) << n); + const int entry = (n > kDivisorLookupBits) + ? RightShiftWithRounding(e, n - kDivisorLookupBits) + : static_cast<int>(e << (kDivisorLookupBits - n)); + *division_shift = n + kDivisorLookupPrecisionBits; + *division_factor = + (value < 0) ? -kDivisorLookup[entry] : kDivisorLookup[entry]; +} + +// This function is a copy from warp_prediction.cc. +int16_t GetShearParameter(int value) { + return static_cast<int16_t>( + LeftShift(RightShiftWithRoundingSigned(value, kWarpParamRoundingBits), + kWarpParamRoundingBits)); +} + +// This function is a copy from warp_prediction.cc. +// This function is used here to help generate valid warp parameters. 
+bool SetupShear(const int* params, int16_t* alpha, int16_t* beta, + int16_t* gamma, int16_t* delta) { + int16_t division_shift; + int16_t division_factor; + GenerateApproximateDivisor<int32_t>(params[2], &division_factor, + &division_shift); + const int alpha0 = + Clip3(params[2] - (1 << kWarpedModelPrecisionBits), INT16_MIN, INT16_MAX); + const int beta0 = Clip3(params[3], INT16_MIN, INT16_MAX); + const int64_t v = LeftShift(params[4], kWarpedModelPrecisionBits); + const int gamma0 = + Clip3(RightShiftWithRoundingSigned(v * division_factor, division_shift), + INT16_MIN, INT16_MAX); + const int64_t w = static_cast<int64_t>(params[3]) * params[4]; + const int delta0 = Clip3( + params[5] - + RightShiftWithRoundingSigned(w * division_factor, division_shift) - + (1 << kWarpedModelPrecisionBits), + INT16_MIN, INT16_MAX); + + *alpha = GetShearParameter(alpha0); + *beta = GetShearParameter(beta0); + *gamma = GetShearParameter(gamma0); + *delta = GetShearParameter(delta0); + if ((4 * std::abs(*alpha) + 7 * std::abs(*beta) >= + (1 << kWarpedModelPrecisionBits)) || + (4 * std::abs(*gamma) + 4 * std::abs(*delta) >= + (1 << kWarpedModelPrecisionBits))) { + return false; // NOLINT (easier condition to understand). 
+ } + + return true; +} + +void GenerateWarpedModel(int* params, int16_t* alpha, int16_t* beta, + int16_t* gamma, int16_t* delta, int seed) { + do { + params[0] = RandomWarpedParam(seed, kWarpedModelPrecisionBits + 6); + params[1] = RandomWarpedParam(seed, kWarpedModelPrecisionBits + 6); + params[2] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3) + + (1 << kWarpedModelPrecisionBits); + params[3] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3); + params[4] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3); + params[5] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3) + + (1 << kWarpedModelPrecisionBits); + ++seed; + } while (params[2] == 0 || !SetupShear(params, alpha, beta, gamma, delta)); +} + +struct WarpTestParam { + WarpTestParam(int width, int height) : width(width), height(height) {} + int width; + int height; +}; + +template <bool is_compound, int bitdepth, typename Pixel> +class WarpTest : public testing::TestWithParam<WarpTestParam> { + public: + WarpTest() = default; + ~WarpTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + WarpInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + WarpInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + WarpInit_SSE4_1(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + func_ = is_compound ? 
dsp->warp_compound : dsp->warp; + } + + protected: + using DestType = + typename std::conditional<is_compound, uint16_t, Pixel>::type; + + void SetInputData(bool use_fixed_values, int value); + void Test(bool use_fixed_values, int value, int num_runs = 1); + void TestFixedValues(); + void TestRandomValues(); + void TestSpeed(); + + const WarpTestParam param_ = GetParam(); + + private: + int warp_params_[8]; + dsp::WarpFunc func_; + // Warp filters are 7-tap, which needs 3 pixels (kConvolveBorderLeftTop) + // padding. Destination buffer indices are based on subsampling values (x+y): + // 0: (4:4:4), 1:(4:2:2), 2: (4:2:0). + Pixel source_[kMaxSourceBlockHeight * kMaxSourceBlockWidth] = {}; + DestType dest_[3][kMaxDestBlockHeight * kMaxDestBlockWidth] = {}; +}; + +template <bool is_compound, int bitdepth, typename Pixel> +void WarpTest<is_compound, bitdepth, Pixel>::SetInputData(bool use_fixed_values, + int value) { + if (use_fixed_values) { + for (int y = 0; y < param_.height; ++y) { + const int row = kSourceBorderVertical + y; + Memset(source_ + row * kMaxSourceBlockWidth + kSourceBorderHorizontal, + value, param_.width); + } + } else { + const int mask = (1 << bitdepth) - 1; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + for (int y = 0; y < param_.height; ++y) { + const int row = kSourceBorderVertical + y; + for (int x = 0; x < param_.width; ++x) { + const int column = kSourceBorderHorizontal + x; + source_[row * kMaxSourceBlockWidth + column] = rnd.Rand16() & mask; + } + } + } + PostFilter::ExtendFrame<Pixel>( + &source_[kSourceBorderVertical * kMaxSourceBlockWidth + + kSourceBorderHorizontal], + param_.width, param_.height, kMaxSourceBlockWidth, + kSourceBorderHorizontal, kSourceBorderHorizontal, kSourceBorderVertical, + kSourceBorderVertical); +} + +template <bool is_compound, int bitdepth, typename Pixel> +void WarpTest<is_compound, bitdepth, Pixel>::Test(bool use_fixed_values, + int value, + int num_runs /*= 1*/) { + if (func_ == 
nullptr) return; + SetInputData(use_fixed_values, value); + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + const int source_offset = + kSourceBorderVertical * kMaxSourceBlockWidth + kSourceBorderHorizontal; + const int dest_offset = + kConvolveBorderLeftTop * kMaxDestBlockWidth + kConvolveBorderLeftTop; + const Pixel* const src = source_ + source_offset; + const ptrdiff_t src_stride = kMaxSourceBlockWidth * sizeof(Pixel); + const ptrdiff_t dst_stride = + is_compound ? kMaxDestBlockWidth : kMaxDestBlockWidth * sizeof(Pixel); + + absl::Duration elapsed_time; + for (int subsampling_x = 0; subsampling_x <= 1; ++subsampling_x) { + for (int subsampling_y = 0; subsampling_y <= 1; ++subsampling_y) { + if (subsampling_x == 0 && subsampling_y == 1) { + // When both are 0: 4:4:4 + // When both are 1: 4:2:0 + // When only |subsampling_x| is 1: 4:2:2 + // Having only |subsampling_y| == 1 is unsupported. + continue; + } + int params[8]; + int16_t alpha; + int16_t beta; + int16_t gamma; + int16_t delta; + GenerateWarpedModel(params, &alpha, &beta, &gamma, &delta, rnd.Rand8()); + + const int dest_id = subsampling_x + subsampling_y; + DestType* const dst = dest_[dest_id] + dest_offset; + const absl::Time start = absl::Now(); + for (int n = 0; n < num_runs; ++n) { + func_(src, src_stride, param_.width, param_.height, params, + subsampling_x, subsampling_y, 0, 0, param_.width, param_.height, + alpha, beta, gamma, delta, dst, dst_stride); + } + elapsed_time += absl::Now() - start; + } + } + + if (use_fixed_values) { + // For fixed values, input and output are identical. + for (size_t i = 0; i < ABSL_ARRAYSIZE(dest_); ++i) { + // |is_compound| holds a few more bits of precision and an offset value. + Pixel compensated_dest[kMaxDestBlockWidth * kMaxDestBlockHeight]; + const int compound_offset = (bitdepth == 8) ? 
0 : kCompoundOffset; + if (is_compound) { + for (int y = 0; y < param_.height; ++y) { + for (int x = 0; x < param_.width; ++x) { + const int compound_value = + dest_[i][dest_offset + y * kMaxDestBlockWidth + x]; + const int remove_offset = compound_value - compound_offset; + const int full_shift = + remove_offset >> + (kInterRoundBitsVertical - kInterRoundBitsCompoundVertical); + compensated_dest[y * kMaxDestBlockWidth + x] = + Clip3(full_shift, 0, (1 << bitdepth) - 1); + } + } + } + Pixel* pixel_dest = + is_compound ? compensated_dest + : reinterpret_cast<Pixel*>(dest_[i] + dest_offset); + const bool success = test_utils::CompareBlocks( + src, pixel_dest, param_.width, param_.height, kMaxSourceBlockWidth, + kMaxDestBlockWidth, false); + EXPECT_TRUE(success) << "subsampling_x + subsampling_y: " << i; + } + } else { + // (width, height): + // (8, 8), id = 0. (8, 16), id = 1. (16, 8), id = 2. + // (16, 16), id = 3. (16, 32), id = 4. (32, 16), id = 5. + // ... + // (128, 128), id = 12. + int id; + if (param_.width == param_.height) { + id = 3 * static_cast<int>(FloorLog2(param_.width) - 3); + } else if (param_.width < param_.height) { + id = 1 + 3 * static_cast<int>(FloorLog2(param_.width) - 3); + } else { + id = 2 + 3 * static_cast<int>(FloorLog2(param_.height) - 3); + } + + const char* expected_digest; + if (bitdepth == 8) { + expected_digest = GetDigest8bpp<is_compound>(id); + } else { +#if LIBGAV1_MAX_BITDEPTH >= 10 + expected_digest = GetDigest10bpp<is_compound>(id); +#endif + } + test_utils::CheckMd5Digest( + "Warp", absl::StrFormat("%dx%d", param_.width, param_.height).c_str(), + expected_digest, dest_, sizeof(dest_), elapsed_time); + } +} + +template <bool is_compound, int bitdepth, typename Pixel> +void WarpTest<is_compound, bitdepth, Pixel>::TestFixedValues() { + Test(true, 0); + Test(true, 1); + Test(true, 128); + Test(true, (1 << bitdepth) - 1); +} + +template <bool is_compound, int bitdepth, typename Pixel> +void WarpTest<is_compound, bitdepth, 
Pixel>::TestRandomValues() { + Test(false, 0); +} + +template <bool is_compound, int bitdepth, typename Pixel> +void WarpTest<is_compound, bitdepth, Pixel>::TestSpeed() { + const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height)); + Test(false, 0, num_runs); +} + +void ApplyFilterToSignedInput(const int min_input, const int max_input, + const int8_t filter[kSubPixelTaps], + int* min_output, int* max_output) { + int min = 0, max = 0; + for (int i = 0; i < kSubPixelTaps; ++i) { + const int tap = filter[i]; + if (tap > 0) { + max += max_input * tap; + min += min_input * tap; + } else { + min += max_input * tap; + max += min_input * tap; + } + } + *min_output = min; + *max_output = max; +} + +void ApplyFilterToUnsignedInput(const int max_input, + const int8_t filter[kSubPixelTaps], + int* min_output, int* max_output) { + ApplyFilterToSignedInput(0, max_input, filter, min_output, max_output); +} + +// Validate the maximum ranges for different parts of the Warp process. +template <int bitdepth> +void ShowRange() { + constexpr int horizontal_bits = (bitdepth == kBitdepth12) + ? kInterRoundBitsHorizontal12bpp + : kInterRoundBitsHorizontal; + constexpr int vertical_bits = (bitdepth == kBitdepth12) + ? kInterRoundBitsVertical12bpp + : kInterRoundBitsVertical; + constexpr int compound_vertical_bits = kInterRoundBitsCompoundVertical; + + constexpr int compound_offset = (bitdepth == 8) ? 0 : kCompoundOffset; + + constexpr int max_input = (1 << bitdepth) - 1; + + const int8_t* worst_warp_filter = kWarpedFilters8[93]; + + // First pass. + printf("Bitdepth: %2d Input range: [%8d, %8d]\n", bitdepth, 0, + max_input); + + int min = 0, max = 0; + ApplyFilterToUnsignedInput(max_input, worst_warp_filter, &min, &max); + + int first_pass_offset; + if (bitdepth == 8) { + // Derive an offset for 8 bit. 
+ for (first_pass_offset = 1; - first_pass_offset > min; + first_pass_offset <<= 1) { + } + printf(" 8bpp intermediate offset: %d.\n", first_pass_offset); + min += first_pass_offset; + max += first_pass_offset; + assert(min > 0); + assert(max < UINT16_MAX); + } else { + // 10bpp and 12bpp require int32_t for the intermediate values. Adding an + // offset is not required. + assert(min > INT32_MIN); + assert(max > INT16_MAX && max < INT32_MAX); + } + + printf(" intermediate range: [%8d, %8d]\n", min, max); + + const int first_pass_min = RightShiftWithRounding(min, horizontal_bits); + const int first_pass_max = RightShiftWithRounding(max, horizontal_bits); + + printf(" first pass output range: [%8d, %8d]\n", first_pass_min, + first_pass_max); + + // Second pass. + if (bitdepth == 8) { + ApplyFilterToUnsignedInput(first_pass_max, worst_warp_filter, &min, &max); + } else { + ApplyFilterToSignedInput(first_pass_min, first_pass_max, worst_warp_filter, + &min, &max); + } + + if (bitdepth == 8) { + // Remove the offset that was applied in the first pass since we must use + // int32_t for this phase anyway. 128 is the sum of the filter taps. + const int offset_removal = (first_pass_offset >> horizontal_bits) * 128; + printf(" 8bpp intermediate offset removal: %d.\n", offset_removal); + max -= offset_removal; + min -= offset_removal; + assert(min < INT16_MIN && min > INT32_MIN); + assert(max > INT16_MAX && max < INT32_MAX); + } else { + // 10bpp and 12bpp require int32_t for the intermediate values. Adding an + // offset is not required. + assert(min > INT32_MIN); + assert(max > INT16_MAX && max < INT32_MAX); + } + + printf(" intermediate range: [%8d, %8d]\n", min, max); + + // Second pass non-compound output is clipped to Pixel values. 
+ const int second_pass_min = + Clip3(RightShiftWithRounding(min, vertical_bits), 0, max_input); + const int second_pass_max = + Clip3(RightShiftWithRounding(max, vertical_bits), 0, max_input); + printf(" second pass output range: [%8d, %8d]\n", second_pass_min, + second_pass_max); + + // Output is Pixel so matches Pixel values. + assert(second_pass_min == 0); + assert(second_pass_max == max_input); + + const int compound_second_pass_min = + RightShiftWithRounding(min, compound_vertical_bits) + compound_offset; + const int compound_second_pass_max = + RightShiftWithRounding(max, compound_vertical_bits) + compound_offset; + + printf(" compound second pass output range: [%8d, %8d]\n", + compound_second_pass_min, compound_second_pass_max); + + if (bitdepth == 8) { + // 8bpp output is int16_t without an offset. + assert(compound_second_pass_min > INT16_MIN); + assert(compound_second_pass_max < INT16_MAX); + } else { + // 10bpp and 12bpp use the offset to fit inside uint16_t. + assert(compound_second_pass_min > 0); + assert(compound_second_pass_max < UINT16_MAX); + } + + printf("\n"); +} + +TEST(WarpTest, ShowRange) { + ShowRange<kBitdepth8>(); + ShowRange<kBitdepth10>(); + ShowRange<kBitdepth12>(); +} + +using WarpTest8bpp = WarpTest</*is_compound=*/false, 8, uint8_t>; +// TODO(jzern): Coverage could be added for kInterRoundBitsCompoundVertical via +// WarpCompoundTest. +// using WarpCompoundTest8bpp = WarpTest</*is_compound=*/true, 8, uint8_t>; + +// Verifies the sum of the warped filter coefficients is 128 for every filter. +// +// Verifies the properties used in the calculation of ranges of variables in +// the block warp process: +// * The maximum sum of the positive warped filter coefficients is 175. +// * The minimum (i.e., most negative) sum of the negative warped filter +// coefficients is -47. 
+// +// NOTE: This test is independent of the bitdepth and the implementation of the +// block warp function, so it just needs to be a test in the WarpTest8bpp class +// and does not need to be defined with TEST_P. +TEST(WarpTest8bpp, WarpedFilterCoefficientSums) { + int max_positive_sum = 0; + int min_negative_sum = 0; + for (const auto& filter : kWarpedFilters) { + int sum = 0; + int positive_sum = 0; + int negative_sum = 0; + for (const auto coefficient : filter) { + sum += coefficient; + if (coefficient > 0) { + positive_sum += coefficient; + } else { + negative_sum += coefficient; + } + } + EXPECT_EQ(sum, 128); + max_positive_sum = std::max(positive_sum, max_positive_sum); + min_negative_sum = std::min(negative_sum, min_negative_sum); + } + EXPECT_EQ(max_positive_sum, 175); + EXPECT_EQ(min_negative_sum, -47); +} + +TEST_P(WarpTest8bpp, FixedValues) { TestFixedValues(); } + +TEST_P(WarpTest8bpp, RandomValues) { TestRandomValues(); } + +TEST_P(WarpTest8bpp, DISABLED_Speed) { TestSpeed(); } +const WarpTestParam warp_test_param[] = { + WarpTestParam(8, 8), WarpTestParam(8, 16), WarpTestParam(16, 8), + WarpTestParam(16, 16), WarpTestParam(16, 32), WarpTestParam(32, 16), + WarpTestParam(32, 32), WarpTestParam(32, 64), WarpTestParam(64, 32), + WarpTestParam(64, 64), WarpTestParam(64, 128), WarpTestParam(128, 64), + WarpTestParam(128, 128), +}; + +INSTANTIATE_TEST_SUITE_P(C, WarpTest8bpp, testing::ValuesIn(warp_test_param)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, WarpTest8bpp, + testing::ValuesIn(warp_test_param)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, WarpTest8bpp, + testing::ValuesIn(warp_test_param)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using WarpTest10bpp = WarpTest</*is_compound=*/false, 10, uint16_t>; +// TODO(jzern): Coverage could be added for kInterRoundBitsCompoundVertical via +// WarpCompoundTest. 
+// using WarpCompoundTest10bpp = WarpTest</*is_compound=*/true, 10, uint16_t>; + +TEST_P(WarpTest10bpp, FixedValues) { TestFixedValues(); } + +TEST_P(WarpTest10bpp, RandomValues) { TestRandomValues(); } + +TEST_P(WarpTest10bpp, DISABLED_Speed) { TestSpeed(); } + +INSTANTIATE_TEST_SUITE_P(C, WarpTest10bpp, testing::ValuesIn(warp_test_param)); +#endif + +std::ostream& operator<<(std::ostream& os, const WarpTestParam& warp_param) { + return os << "BlockSize" << warp_param.width << "x" << warp_param.height; +} + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/weight_mask_test.cc b/src/dsp/weight_mask_test.cc new file mode 100644 index 0000000..77b608e --- /dev/null +++ b/src/dsp/weight_mask_test.cc @@ -0,0 +1,390 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/weight_mask.h" + +#include <algorithm> +#include <cstdint> +#include <ostream> +#include <string> +#include <type_traits> + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kNumSpeedTests = 50000; +constexpr int kMaxPredictionSize = 128; +// weight_mask is only used with kCompoundPredictionTypeDiffWeighted with +// convolve producing the most extreme ranges. +// This includes kCompoundOffset in 10bpp and 12bpp. +// see: src/dsp/convolve.cc & src/dsp/warp.cc. +constexpr int kCompoundPredictionRange[3][2] = { + // 8bpp + {-5132, 9212}, + // 10bpp + {3988, 61532}, + // 12bpp + {3974, 61559}, +}; + +const char* GetDigest8bpp(int id) { + static const char* const kDigest[] = { + "035267cb2ac5a0f8ff50c2d30ad52226", + "3231f4972dd858b734e0cc48c4cd001e", + "7e163b69721a13ec9f75b5cd74ffee3f", + "" /*kBlock4x16*/, + "b75e90abc224acca8754c82039b3ba93", + "9f555f3a2c1a933a663d6103b8118dea", + "8539e54f34cd6668ff6e6606210be201", + "20f85c9db7c878c21fbf2052936f269e", + "620ec166de57b0639260b2d72eebfc3e", + "be666394b5a894d78f4097b6cca272fe", + "57a96816e84cdb381f596c23827b5922", + "f2e0d348f608f246b6d8d799b66c189e", + "161ac051f38372d9339d36728b9926ba", + "d5fad48aaf132a81cb62bba4f07bbebb", + "e10be2dca2f7dae38dae75150fc1612d", + "7f744481eb551bbc224b5236c82cbade", + "0d99bbf31ecddc1c2d5063a68c0e9375", + "5fb8ec5f582f0ebfe519ed55860f67c4", + + // mask_is_inverse = true. 
+ "a4250ca39daa700836138371d36d465f", + "abe9a9a1c3a5accda9bfefd4d6e81ccb", + "e95b08878d0bb5f2293c27c3a6fe0253", + "" /*kBlock4x16*/, + "e1c52be02ce9ab2800015bb08b866c31", + "eea1dc73811f73866edfeb4555865f20", + "3178e64085645bd819256a8ab43c7b0a", + "ee83884e4d5cd2c9ac04879116bab681", + "d107eff7d5ae9ba14d2c6b3b8d9fca49", + "400aeea7d299626fc336c46b1ad7a9d8", + "e9e26a400f67f3ad36350fe4171fc613", + "4c31ad714f470f34127febaf1bac714b", + "bbdcb1097c66d561dd4ea16b3fb73f97", + "3a21dfbf53e4c964e303a75a3308ce15", + "3416dab4512fd0dc61d788b433cd624e", + "68ace8f01fdd74aec3fee528c8167738", + "9fabe05a6523da81a45150e19f75acff", + "7c0643e4d02421d06d7ca71822a94e1d", + }; + return kDigest[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetDigest10bpp(int id) { + static const char* const kDigest[] = { + "1dc9bdd042e5228705b857b42798e364", + "c054c8644bd482ce78a139d8e063e013", + "bbe4ac48f013f34c84779da05b0bcbe0", + "" /*kBlock4x16*/, + "13d4759277637a607f25439182553708", + "f089667610561a47d50f9f930ad7c454", + "46715e6f7819f59725bdb083f4403255", + "3774541c339ae3af920ef2b1d6abf6a1", + "94913b01d226cb5eb273dfee84b51f65", + "be0c0847629dfff8e0e991ed67697a7d", + "716b5398b77d7459274d4ea9c91ebd8e", + "f5c1b0b461df4182529949472242b421", + "5e9576ea4cf107249ce4ae89a72b9c95", + "da021bcdf7936f7bd9a2399c69e4d37c", + "b3a310a39c1900e00f992839ff188656", + "9f3a15351af5945615f296242ec56a38", + "b6e0bd03c521c5f00e90530daa7d4432", + "3270d7f621d488aec5b76bcf121debd0", + + // mask_is_inverse = true. 
+ "33df96dd246683133eefe4caea6e3f7d", + "73e0ccc5d42806548a4b59f856256c1e", + "3561a0358cf831aee9477d07feafae2d", + "" /*kBlock4x16*/, + "c5a2e633c0cd6925e68f21f47f0e2d84", + "8755a2d3840dde5fd6a0cce6bd6642c5", + "85ec538b72cecd6ea1fddab5ce3b4e64", + "a53e0dec84c675c4c6b1f5792b0232ff", + "86180da325f9727670a98cf2dbf7410e", + "a5fdc95104948047e179b2bc3d47f51d", + "9b95b3858187838e4669180e2ddb295e", + "6e40ca55608f6bf2f8cd91c8dbf3ddbf", + "d3a092672e921b588279d57e50b31888", + "9883eb19b733ee9f1cb6a6b6a1a00bb5", + "dd34764e068b228b7820321b06864e63", + "6c743dc9c8c87c7044151d29993e5042", + "44925dab01011a98b8ab1f0308fa852a", + "6d984b2ccfa056278e2130771127a943", + }; + return kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +struct WeightMaskTestParam { + WeightMaskTestParam(int width, int height, bool mask_is_inverse) + : width(width), height(height), mask_is_inverse(mask_is_inverse) {} + int width; + int height; + bool mask_is_inverse; +}; + +std::ostream& operator<<(std::ostream& os, const WeightMaskTestParam& param) { + return os << param.width << "x" << param.height + << ", mask_is_inverse: " << param.mask_is_inverse; +} + +template <int bitdepth> +class WeightMaskTest : public testing::TestWithParam<WeightMaskTestParam>, + public test_utils::MaxAlignedAllocable { + public: + WeightMaskTest() = default; + ~WeightMaskTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + WeightMaskInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + const int width_index = FloorLog2(width_) - 3; + const int height_index = FloorLog2(height_) - 3; + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + WeightMaskInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + 
WeightMaskInit_SSE4_1(); + } + func_ = dsp->weight_mask[width_index][height_index][mask_is_inverse_]; + } + + protected: + void SetInputData(bool use_fixed_values, int value_1, int value_2); + void Test(int num_runs, bool use_fixed_values, int value_1, int value_2); + + private: + const int width_ = GetParam().width; + const int height_ = GetParam().height; + const bool mask_is_inverse_ = GetParam().mask_is_inverse; + using PredType = + typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type; + alignas( + kMaxAlignment) PredType block_1_[kMaxPredictionSize * kMaxPredictionSize]; + alignas( + kMaxAlignment) PredType block_2_[kMaxPredictionSize * kMaxPredictionSize]; + uint8_t mask_[kMaxPredictionSize * kMaxPredictionSize] = {}; + dsp::WeightMaskFunc func_; +}; + +template <int bitdepth> +void WeightMaskTest<bitdepth>::SetInputData(const bool use_fixed_values, + const int value_1, + const int value_2) { + if (use_fixed_values) { + std::fill(block_1_, block_1_ + kMaxPredictionSize * kMaxPredictionSize, + value_1); + std::fill(block_2_, block_2_ + kMaxPredictionSize * kMaxPredictionSize, + value_2); + } else { + constexpr int bitdepth_index = (bitdepth - 8) >> 1; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + for (int y = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x) { + const int min_val = kCompoundPredictionRange[bitdepth_index][0]; + const int max_val = kCompoundPredictionRange[bitdepth_index][1]; + block_1_[y * width_ + x] = + static_cast<PredType>(rnd(max_val - min_val) + min_val); + block_2_[y * width_ + x] = + static_cast<PredType>(rnd(max_val - min_val) + min_val); + } + } + } +} + +BlockSize DimensionsToBlockSize(int width, int height) { + if (width == 4) { + if (height == 4) return kBlock4x4; + if (height == 8) return kBlock4x8; + if (height == 16) return kBlock4x16; + return kBlockInvalid; + } + if (width == 8) { + if (height == 4) return kBlock8x4; + if (height == 8) return kBlock8x8; + if (height == 16) 
return kBlock8x16; + if (height == 32) return kBlock8x32; + return kBlockInvalid; + } + if (width == 16) { + if (height == 4) return kBlock16x4; + if (height == 8) return kBlock16x8; + if (height == 16) return kBlock16x16; + if (height == 32) return kBlock16x32; + if (height == 64) return kBlock16x64; + return kBlockInvalid; + } + if (width == 32) { + if (height == 8) return kBlock32x8; + if (height == 16) return kBlock32x16; + if (height == 32) return kBlock32x32; + if (height == 64) return kBlock32x64; + return kBlockInvalid; + } + if (width == 64) { + if (height == 16) return kBlock64x16; + if (height == 32) return kBlock64x32; + if (height == 64) return kBlock64x64; + if (height == 128) return kBlock64x128; + return kBlockInvalid; + } + if (width == 128) { + if (height == 64) return kBlock128x64; + if (height == 128) return kBlock128x128; + return kBlockInvalid; + } + return kBlockInvalid; +} + +template <int bitdepth> +void WeightMaskTest<bitdepth>::Test(const int num_runs, + const bool use_fixed_values, + const int value_1, const int value_2) { + if (func_ == nullptr) return; + SetInputData(use_fixed_values, value_1, value_2); + const absl::Time start = absl::Now(); + for (int i = 0; i < num_runs; ++i) { + func_(block_1_, block_2_, mask_, kMaxPredictionSize); + } + const absl::Duration elapsed_time = absl::Now() - start; + if (use_fixed_values) { + int fixed_value = (value_1 - value_2 == 0) ? 38 : 64; + if (mask_is_inverse_) fixed_value = 64 - fixed_value; + for (int y = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x) { + ASSERT_EQ(static_cast<int>(mask_[y * kMaxPredictionSize + x]), + fixed_value) + << "x: " << x << " y: " << y; + } + } + } else { + const int id_offset = mask_is_inverse_ ? 
kMaxBlockSizes - 4 : 0; + const int id = id_offset + + static_cast<int>(DimensionsToBlockSize(width_, height_)) - 4; + if (bitdepth == 8) { + test_utils::CheckMd5Digest( + absl::StrFormat("BlockSize %dx%d", width_, height_).c_str(), + "WeightMask", GetDigest8bpp(id), mask_, sizeof(mask_), elapsed_time); +#if LIBGAV1_MAX_BITDEPTH >= 10 + } else { + test_utils::CheckMd5Digest( + absl::StrFormat("BlockSize %dx%d", width_, height_).c_str(), + "WeightMask", GetDigest10bpp(id), mask_, sizeof(mask_), elapsed_time); +#endif + } + } +} + +const WeightMaskTestParam weight_mask_test_param[] = { + WeightMaskTestParam(8, 8, false), WeightMaskTestParam(8, 16, false), + WeightMaskTestParam(8, 32, false), WeightMaskTestParam(16, 8, false), + WeightMaskTestParam(16, 16, false), WeightMaskTestParam(16, 32, false), + WeightMaskTestParam(16, 64, false), WeightMaskTestParam(32, 8, false), + WeightMaskTestParam(32, 16, false), WeightMaskTestParam(32, 32, false), + WeightMaskTestParam(32, 64, false), WeightMaskTestParam(64, 16, false), + WeightMaskTestParam(64, 32, false), WeightMaskTestParam(64, 64, false), + WeightMaskTestParam(64, 128, false), WeightMaskTestParam(128, 64, false), + WeightMaskTestParam(128, 128, false), WeightMaskTestParam(8, 8, true), + WeightMaskTestParam(8, 16, true), WeightMaskTestParam(8, 32, true), + WeightMaskTestParam(16, 8, true), WeightMaskTestParam(16, 16, true), + WeightMaskTestParam(16, 32, true), WeightMaskTestParam(16, 64, true), + WeightMaskTestParam(32, 8, true), WeightMaskTestParam(32, 16, true), + WeightMaskTestParam(32, 32, true), WeightMaskTestParam(32, 64, true), + WeightMaskTestParam(64, 16, true), WeightMaskTestParam(64, 32, true), + WeightMaskTestParam(64, 64, true), WeightMaskTestParam(64, 128, true), + WeightMaskTestParam(128, 64, true), WeightMaskTestParam(128, 128, true), +}; + +using WeightMaskTest8bpp = WeightMaskTest<8>; + +TEST_P(WeightMaskTest8bpp, FixedValues) { + const int min = kCompoundPredictionRange[0][0]; + const int max = 
kCompoundPredictionRange[0][1]; + Test(1, true, min, min); + Test(1, true, min, max); + Test(1, true, max, min); + Test(1, true, max, max); +} + +TEST_P(WeightMaskTest8bpp, RandomValues) { Test(1, false, -1, -1); } + +TEST_P(WeightMaskTest8bpp, DISABLED_Speed) { + Test(kNumSpeedTests, false, -1, -1); +} + +INSTANTIATE_TEST_SUITE_P(C, WeightMaskTest8bpp, + testing::ValuesIn(weight_mask_test_param)); +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, WeightMaskTest8bpp, + testing::ValuesIn(weight_mask_test_param)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, WeightMaskTest8bpp, + testing::ValuesIn(weight_mask_test_param)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using WeightMaskTest10bpp = WeightMaskTest<10>; + +TEST_P(WeightMaskTest10bpp, FixedValues) { + const int min = kCompoundPredictionRange[1][0]; + const int max = kCompoundPredictionRange[1][1]; + Test(1, true, min, min); + Test(1, true, min, max); + Test(1, true, max, min); + Test(1, true, max, max); +} + +TEST_P(WeightMaskTest10bpp, RandomValues) { Test(1, false, -1, -1); } + +TEST_P(WeightMaskTest10bpp, DISABLED_Speed) { + Test(kNumSpeedTests, false, -1, -1); +} + +INSTANTIATE_TEST_SUITE_P(C, WeightMaskTest10bpp, + testing::ValuesIn(weight_mask_test_param)); +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, WeightMaskTest10bpp, + testing::ValuesIn(weight_mask_test_param)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, WeightMaskTest10bpp, + testing::ValuesIn(weight_mask_test_param)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc index 8e008d1..ec9f589 100644 --- a/src/dsp/x86/average_blend_sse4.cc +++ b/src/dsp/x86/average_blend_sse4.cc @@ -30,6 +30,7 @@ namespace libgav1 { namespace dsp { +namespace low_bitdepth { namespace { constexpr int kInterPostRoundBit = 4; @@ -138,13 +139,232 @@ void Init8bpp() { 
} } // namespace +} // namespace low_bitdepth -void AverageBlendInit_SSE4_1() { Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +constexpr int kInterPostRoundBitPlusOne = 5; + +template <const int width, const int offset> +inline void AverageBlendRow(const uint16_t* prediction_0, + const uint16_t* prediction_1, + const __m128i& compound_offset, + const __m128i& round_offset, const __m128i& max, + const __m128i& zero, uint16_t* dst, + const ptrdiff_t dest_stride) { + // pred_0/1 max range is 16b. + const __m128i pred_0 = LoadUnaligned16(prediction_0 + offset); + const __m128i pred_1 = LoadUnaligned16(prediction_1 + offset); + const __m128i pred_00 = _mm_cvtepu16_epi32(pred_0); + const __m128i pred_01 = _mm_unpackhi_epi16(pred_0, zero); + const __m128i pred_10 = _mm_cvtepu16_epi32(pred_1); + const __m128i pred_11 = _mm_unpackhi_epi16(pred_1, zero); + + const __m128i pred_add_0 = _mm_add_epi32(pred_00, pred_10); + const __m128i pred_add_1 = _mm_add_epi32(pred_01, pred_11); + const __m128i compound_offset_0 = _mm_sub_epi32(pred_add_0, compound_offset); + const __m128i compound_offset_1 = _mm_sub_epi32(pred_add_1, compound_offset); + // RightShiftWithRounding and Clip3. + const __m128i round_0 = _mm_add_epi32(compound_offset_0, round_offset); + const __m128i round_1 = _mm_add_epi32(compound_offset_1, round_offset); + const __m128i res_0 = _mm_srai_epi32(round_0, kInterPostRoundBitPlusOne); + const __m128i res_1 = _mm_srai_epi32(round_1, kInterPostRoundBitPlusOne); + const __m128i result = _mm_min_epi16(_mm_packus_epi32(res_0, res_1), max); + if (width != 4) { + // Store width=8/16/32/64/128. 
+ StoreUnaligned16(dst + offset, result); + return; + } + assert(width == 4); + StoreLo8(dst, result); + StoreHi8(dst + dest_stride, result); +} + +void AverageBlend10bpp_SSE4_1(const void* prediction_0, + const void* prediction_1, const int width, + const int height, void* const dest, + const ptrdiff_t dst_stride) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dest_stride = dst_stride / sizeof(dst[0]); + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + const __m128i compound_offset = + _mm_set1_epi32(kCompoundOffset + kCompoundOffset); + const __m128i round_offset = + _mm_set1_epi32((1 << kInterPostRoundBitPlusOne) >> 1); + const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1); + const __m128i zero = _mm_setzero_si128(); + int y = height; + + if (width == 4) { + const ptrdiff_t dest_stride2 = dest_stride << 1; + const ptrdiff_t width2 = width << 1; + do { + // row0,1 + AverageBlendRow<4, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + dst += dest_stride2; + pred_0 += width2; + pred_1 += width2; + y -= 2; + } while (y != 0); + return; + } + if (width == 8) { + const ptrdiff_t dest_stride2 = dest_stride << 1; + const ptrdiff_t width2 = width << 1; + do { + // row0. + AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // row1. + AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset, + round_offset, max, zero, dst + dest_stride, + dest_stride); + dst += dest_stride2; + pred_0 += width2; + pred_1 += width2; + y -= 2; + } while (y != 0); + return; + } + if (width == 16) { + const ptrdiff_t dest_stride2 = dest_stride << 1; + const ptrdiff_t width2 = width << 1; + do { + // row0. 
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // row1. + AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset, + round_offset, max, zero, dst + dest_stride, + dest_stride); + AverageBlendRow<8, 8>(pred_0 + width, pred_1 + width, compound_offset, + round_offset, max, zero, dst + dest_stride, + dest_stride); + dst += dest_stride2; + pred_0 += width2; + pred_1 += width2; + y -= 2; + } while (y != 0); + return; + } + if (width == 32) { + do { + // pred [0 - 15]. + AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // pred [16 - 31]. + AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); + return; + } + if (width == 64) { + do { + // pred [0 - 31]. + AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // pred [31 - 63]. 
+ AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); + return; + } + assert(width == 128); + do { + // pred [0 - 31]. + AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // pred [31 - 63]. + AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + + // pred [64 - 95]. + AverageBlendRow<8, 64>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 72>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 80>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 88>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // pred [96 - 127]. 
+ AverageBlendRow<8, 96>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 104>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 112>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 120>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); +#if DSP_ENABLED_10BPP_SSE4_1(AverageBlend) + dsp->average_blend = AverageBlend10bpp_SSE4_1; +#endif +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void AverageBlendInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/average_blend_sse4.h b/src/dsp/x86/average_blend_sse4.h index 937e8e2..cd07112 100644 --- a/src/dsp/x86/average_blend_sse4.h +++ b/src/dsp/x86/average_blend_sse4.h @@ -32,9 +32,13 @@ void AverageBlendInit_SSE4_1(); // If sse4 is enabled and the baseline isn't set due to a higher level of // optimization being enabled, signal the sse4 implementation should be used. 
#if LIBGAV1_TARGETING_SSE4_1 + #ifndef LIBGAV1_Dsp8bpp_AverageBlend #define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_SSE4_1 #endif +#ifndef LIBGAV1_Dsp10bpp_AverageBlend +#define LIBGAV1_Dsp10bpp_AverageBlend LIBGAV1_CPU_SSE4_1 +#endif #endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/cdef_avx2.cc b/src/dsp/x86/cdef_avx2.cc new file mode 100644 index 0000000..d41dc38 --- /dev/null +++ b/src/dsp/x86/cdef_avx2.cc @@ -0,0 +1,784 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/cdef.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_AVX2 +#include <immintrin.h> + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstdlib> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_avx2.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +#include "src/dsp/cdef.inc" + +// Used when calculating odd |cost[x]| values. +// Holds elements 1 3 5 7 7 7 7 7 +alignas(32) constexpr uint32_t kCdefDivisionTableOddPairsPadded[] = { + 420, 210, 140, 105, 420, 210, 140, 105, + 105, 105, 105, 105, 105, 105, 105, 105}; + +// ---------------------------------------------------------------------------- +// Refer to CdefDirection_C(). 
+// +// int32_t partial[8][15] = {}; +// for (int i = 0; i < 8; ++i) { +// for (int j = 0; j < 8; ++j) { +// const int x = 1; +// partial[0][i + j] += x; +// partial[1][i + j / 2] += x; +// partial[2][i] += x; +// partial[3][3 + i - j / 2] += x; +// partial[4][7 + i - j] += x; +// partial[5][3 - i / 2 + j] += x; +// partial[6][j] += x; +// partial[7][i / 2 + j] += x; +// } +// } +// +// Using the code above, generate the position count for partial[8][15]. +// +// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1 +// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 +// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0 +// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 +// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1 +// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 +// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0 +// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 +// +// The SIMD code shifts the input horizontally, then adds vertically to get the +// correct partial value for the given position. +// ---------------------------------------------------------------------------- + +// ---------------------------------------------------------------------------- +// partial[0][i + j] += x; +// +// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 +// 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00 +// 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00 +// 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00 +// 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00 +// 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00 +// 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00 +// 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77 +// +// partial[4] is the same except the source is reversed. 
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(__m256i* v_src_16, + __m256i* partial_lo, + __m256i* partial_hi) { + // 00 01 02 03 04 05 06 07 + *partial_lo = v_src_16[0]; + // 00 00 00 00 00 00 00 00 + *partial_hi = _mm256_setzero_si256(); + + // 00 10 11 12 13 14 15 16 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[1], 2)); + // 17 00 00 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[1], 14)); + + // 00 00 20 21 22 23 24 25 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[2], 4)); + // 26 27 00 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[2], 12)); + + // 00 00 00 30 31 32 33 34 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[3], 6)); + // 35 36 37 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[3], 10)); + + // 00 00 00 00 40 41 42 43 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[4], 8)); + // 44 45 46 47 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[4], 8)); + + // 00 00 00 00 00 50 51 52 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[5], 10)); + // 53 54 55 56 57 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[5], 6)); + + // 00 00 00 00 00 00 60 61 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[6], 12)); + // 62 63 64 65 66 67 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[6], 4)); + + // 00 00 00 00 00 00 00 70 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[7], 14)); + // 71 72 73 74 75 76 77 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[7], 2)); +} + +// ---------------------------------------------------------------------------- +// partial[1][i + j / 2] += x; +// +// A0 = src[0] + src[1], A1 = 
src[2] + src[3], ... +// +// A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00 +// 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00 +// 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00 +// 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00 +// 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00 +// 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00 +// 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00 +// 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00 +// +// partial[3] is the same except the source is reversed. +LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(__m256i* v_src_16, + __m256i* partial_lo, + __m256i* partial_hi) { + __m256i v_d1_temp[8]; + const __m256i v_zero = _mm256_setzero_si256(); + + for (int i = 0; i < 8; ++i) { + v_d1_temp[i] = _mm256_hadd_epi16(v_src_16[i], v_zero); + } + + *partial_lo = *partial_hi = v_zero; + // A0 A1 A2 A3 00 00 00 00 + *partial_lo = _mm256_add_epi16(*partial_lo, v_d1_temp[0]); + + // 00 B0 B1 B2 B3 00 00 00 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[1], 2)); + + // 00 00 C0 C1 C2 C3 00 00 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[2], 4)); + // 00 00 00 D0 D1 D2 D3 00 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[3], 6)); + // 00 00 00 00 E0 E1 E2 E3 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[4], 8)); + + // 00 00 00 00 00 F0 F1 F2 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[5], 10)); + // F3 00 00 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[5], 6)); + + // 00 00 00 00 00 00 G0 G1 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[6], 12)); + // G2 G3 00 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[6], 4)); + + // 00 00 00 00 00 00 00 H0 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[7], 14)); + // H1 H2 H3 00 00 00 00 00 + *partial_hi = + 
_mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[7], 2)); +} + +// ---------------------------------------------------------------------------- +// partial[7][i / 2 + j] += x; +// +// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 +// 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 +// 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00 +// 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00 +// 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00 +// 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00 +// 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00 +// 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00 +// +// partial[5] is the same except the source is reversed. +LIBGAV1_ALWAYS_INLINE void AddPartial_D7_D5(__m256i* v_src, __m256i* partial_lo, + __m256i* partial_hi) { + __m256i v_pair_add[4]; + // Add vertical source pairs. + v_pair_add[0] = _mm256_add_epi16(v_src[0], v_src[1]); + v_pair_add[1] = _mm256_add_epi16(v_src[2], v_src[3]); + v_pair_add[2] = _mm256_add_epi16(v_src[4], v_src[5]); + v_pair_add[3] = _mm256_add_epi16(v_src[6], v_src[7]); + + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + *partial_lo = v_pair_add[0]; + // 00 00 00 00 00 00 00 00 + // 00 00 00 00 00 00 00 00 + *partial_hi = _mm256_setzero_si256(); + + // 00 20 21 22 23 24 25 26 + // 00 30 31 32 33 34 35 36 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[1], 2)); + // 27 00 00 00 00 00 00 00 + // 37 00 00 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[1], 14)); + + // 00 00 40 41 42 43 44 45 + // 00 00 50 51 52 53 54 55 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[2], 4)); + // 46 47 00 00 00 00 00 00 + // 56 57 00 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[2], 12)); + + // 00 00 00 60 61 62 63 64 + // 00 00 00 70 71 72 73 74 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[3], 6)); + // 65 66 67 00 00 00 00 00 + // 75 76 77 00 00 00 
00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[3], 10)); +} + +LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* src, ptrdiff_t stride, + __m256i* partial) { + // 8x8 input + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + __m256i v_src[8]; + for (auto& i : v_src) { + i = _mm256_castsi128_si256(LoadLo8(src)); + // Dup lower lane. + i = _mm256_permute2x128_si256(i, i, 0x0); + src += stride; + } + + const __m256i v_zero = _mm256_setzero_si256(); + // partial for direction 2 + // -------------------------------------------------------------------------- + // partial[2][i] += x; + // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx + // 01 11 21 33 41 51 61 71 xx xx xx xx xx xx xx xx + // 02 12 22 33 42 52 62 72 xx xx xx xx xx xx xx xx + // 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx + // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx + // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx + // 06 16 26 36 46 56 66 76 xx xx xx xx xx xx xx xx + // 07 17 27 37 47 57 67 77 xx xx xx xx xx xx xx xx + const __m256i v_src_4_0 = _mm256_unpacklo_epi64(v_src[0], v_src[4]); + const __m256i v_src_5_1 = _mm256_unpacklo_epi64(v_src[1], v_src[5]); + const __m256i v_src_6_2 = _mm256_unpacklo_epi64(v_src[2], v_src[6]); + const __m256i v_src_7_3 = _mm256_unpacklo_epi64(v_src[3], v_src[7]); + const __m256i v_hsum_4_0 = _mm256_sad_epu8(v_src_4_0, v_zero); + const __m256i v_hsum_5_1 = _mm256_sad_epu8(v_src_5_1, v_zero); + const __m256i v_hsum_6_2 = _mm256_sad_epu8(v_src_6_2, v_zero); + const __m256i v_hsum_7_3 = _mm256_sad_epu8(v_src_7_3, v_zero); + const __m256i v_hsum_1_0 = _mm256_unpacklo_epi16(v_hsum_4_0, v_hsum_5_1); + const __m256i v_hsum_3_2 = _mm256_unpacklo_epi16(v_hsum_6_2, v_hsum_7_3); + const __m256i v_hsum_5_4 = _mm256_unpackhi_epi16(v_hsum_4_0, 
v_hsum_5_1); + const __m256i v_hsum_7_6 = _mm256_unpackhi_epi16(v_hsum_6_2, v_hsum_7_3); + partial[2] = + _mm256_unpacklo_epi64(_mm256_unpacklo_epi32(v_hsum_1_0, v_hsum_3_2), + _mm256_unpacklo_epi32(v_hsum_5_4, v_hsum_7_6)); + + const __m256i extend_reverse = SetrM128i( + _mm_set_epi32(static_cast<int>(0x80078006), static_cast<int>(0x80058004), + static_cast<int>(0x80038002), static_cast<int>(0x80018000)), + _mm_set_epi32(static_cast<int>(0x80008001), static_cast<int>(0x80028003), + static_cast<int>(0x80048005), + static_cast<int>(0x80068007))); + + for (auto& i : v_src) { + // Zero extend unsigned 8 to 16. The upper lane is reversed. + i = _mm256_shuffle_epi8(i, extend_reverse); + } + + // partial for direction 6 + // -------------------------------------------------------------------------- + // partial[6][j] += x; + // 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // 40 41 42 43 44 45 46 47 xx xx xx xx xx xx xx xx + // 50 51 52 53 54 55 56 57 xx xx xx xx xx xx xx xx + // 60 61 62 63 64 65 66 67 xx xx xx xx xx xx xx xx + // 70 71 72 73 74 75 76 77 xx xx xx xx xx xx xx xx + partial[6] = v_src[0]; + for (int i = 1; i < 8; ++i) { + partial[6] = _mm256_add_epi16(partial[6], v_src[i]); + } + + AddPartial_D0_D4(v_src, &partial[0], &partial[4]); + AddPartial_D1_D3(v_src, &partial[1], &partial[3]); + AddPartial_D7_D5(v_src, &partial[7], &partial[5]); +} + +inline __m256i SumVectorPair_S32(__m256i a) { + a = _mm256_hadd_epi32(a, a); + a = _mm256_add_epi32(a, _mm256_srli_si256(a, 4)); + return a; +} + +// |cost[0]| and |cost[4]| square the input and sum with the corresponding +// element from the other end of the vector: +// |kCdefDivisionTable[]| element: +// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) * +// kCdefDivisionTable[i + 1]; +// cost[0] += Square(partial[0][7]) * 
kCdefDivisionTable[8]; +inline void Cost0Or4_Pair(uint32_t* cost, const __m256i partial_0, + const __m256i partial_4, + const __m256i division_table) { + const __m256i division_table_0 = + _mm256_permute2x128_si256(division_table, division_table, 0x0); + const __m256i division_table_1 = + _mm256_permute2x128_si256(division_table, division_table, 0x11); + + // partial_lo + const __m256i a = partial_0; + // partial_hi + const __m256i b = partial_4; + + // Reverse and clear upper 2 bytes. + const __m256i reverser = _mm256_broadcastsi128_si256(_mm_set_epi32( + static_cast<int>(0x80800100), 0x03020504, 0x07060908, 0x0b0a0d0c)); + + // 14 13 12 11 10 09 08 ZZ + const __m256i b_reversed = _mm256_shuffle_epi8(b, reverser); + // 00 14 01 13 02 12 03 11 + const __m256i ab_lo = _mm256_unpacklo_epi16(a, b_reversed); + // 04 10 05 09 06 08 07 ZZ + const __m256i ab_hi = _mm256_unpackhi_epi16(a, b_reversed); + + // Square(partial[0][i]) + Square(partial[0][14 - i]) + const __m256i square_lo = _mm256_madd_epi16(ab_lo, ab_lo); + const __m256i square_hi = _mm256_madd_epi16(ab_hi, ab_hi); + + const __m256i c = _mm256_mullo_epi32(square_lo, division_table_0); + const __m256i d = _mm256_mullo_epi32(square_hi, division_table_1); + const __m256i e = SumVectorPair_S32(_mm256_add_epi32(c, d)); + // Copy upper 32bit sum to lower lane. + const __m128i sums = + _mm256_castsi256_si128(_mm256_permute4x64_epi64(e, 0x08)); + cost[0] = _mm_cvtsi128_si32(sums); + cost[4] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8)); +} + +template <int index_a, int index_b> +inline void CostOdd_Pair(uint32_t* cost, const __m256i partial_a, + const __m256i partial_b, + const __m256i division_table[2]) { + // partial_lo + const __m256i a = partial_a; + // partial_hi + const __m256i b = partial_b; + + // Reverse and clear upper 10 bytes. 
+ const __m256i reverser = _mm256_broadcastsi128_si256( + _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080), + static_cast<int>(0x80800100), 0x03020504)); + + // 10 09 08 ZZ ZZ ZZ ZZ ZZ + const __m256i b_reversed = _mm256_shuffle_epi8(b, reverser); + // 00 10 01 09 02 08 03 ZZ + const __m256i ab_lo = _mm256_unpacklo_epi16(a, b_reversed); + // 04 ZZ 05 ZZ 06 ZZ 07 ZZ + const __m256i ab_hi = _mm256_unpackhi_epi16(a, b_reversed); + + // Square(partial[0][i]) + Square(partial[0][14 - i]) + const __m256i square_lo = _mm256_madd_epi16(ab_lo, ab_lo); + const __m256i square_hi = _mm256_madd_epi16(ab_hi, ab_hi); + + const __m256i c = _mm256_mullo_epi32(square_lo, division_table[0]); + const __m256i d = _mm256_mullo_epi32(square_hi, division_table[1]); + const __m256i e = SumVectorPair_S32(_mm256_add_epi32(c, d)); + // Copy upper 32bit sum to lower lane. + const __m128i sums = + _mm256_castsi256_si128(_mm256_permute4x64_epi64(e, 0x08)); + cost[index_a] = _mm_cvtsi128_si32(sums); + cost[index_b] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8)); +} + +inline void Cost2And6_Pair(uint32_t* cost, const __m256i partial_a, + const __m256i partial_b, + const __m256i division_table) { + // The upper lane is a "don't care", so only use the lower lane for + // calculating cost. + const __m256i a = _mm256_permute2x128_si256(partial_a, partial_b, 0x20); + + const __m256i square_a = _mm256_madd_epi16(a, a); + const __m256i b = _mm256_mullo_epi32(square_a, division_table); + const __m256i c = SumVectorPair_S32(b); + // Copy upper 32bit sum to lower lane. 
+ const __m128i sums = + _mm256_castsi256_si128(_mm256_permute4x64_epi64(c, 0x08)); + cost[2] = _mm_cvtsi128_si32(sums); + cost[6] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8)); +} + +void CdefDirection_AVX2(const void* const source, ptrdiff_t stride, + uint8_t* const direction, int* const variance) { + assert(direction != nullptr); + assert(variance != nullptr); + const auto* src = static_cast<const uint8_t*>(source); + uint32_t cost[8]; + + // partial[0] = add partial 0,4 low + // partial[1] = add partial 1,3 low + // partial[2] = add partial 2 low + // partial[3] = add partial 1,3 high + // partial[4] = add partial 0,4 high + // partial[5] = add partial 7,5 high + // partial[6] = add partial 6 low + // partial[7] = add partial 7,5 low + __m256i partial[8]; + + AddPartial(src, stride, partial); + + const __m256i division_table = LoadUnaligned32(kCdefDivisionTable); + const __m256i division_table_7 = + _mm256_broadcastd_epi32(_mm_cvtsi32_si128(kCdefDivisionTable[7])); + + Cost2And6_Pair(cost, partial[2], partial[6], division_table_7); + + Cost0Or4_Pair(cost, partial[0], partial[4], division_table); + + const __m256i division_table_odd[2] = { + LoadUnaligned32(kCdefDivisionTableOddPairsPadded), + LoadUnaligned32(kCdefDivisionTableOddPairsPadded + 8)}; + + CostOdd_Pair<1, 3>(cost, partial[1], partial[3], division_table_odd); + CostOdd_Pair<7, 5>(cost, partial[7], partial[5], division_table_odd); + + uint32_t best_cost = 0; + *direction = 0; + for (int i = 0; i < 8; ++i) { + if (cost[i] > best_cost) { + best_cost = cost[i]; + *direction = i; + } + } + *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10; +} + +// ------------------------------------------------------------------------- +// CdefFilter + +// Load 4 vectors based on the given |direction|. +inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride, + __m128i* output, const int direction) { + // Each |direction| describes a different set of source values. 
Expand this + // set by negating each set. For |direction| == 0 this gives a diagonal line + // from top right to bottom left. The first value is y, the second x. Negative + // y values move up. + // a b c d + // {-1, 1}, {1, -1}, {-2, 2}, {2, -2} + // c + // a + // 0 + // b + // d + const int y_0 = kCdefDirections[direction][0][0]; + const int x_0 = kCdefDirections[direction][0][1]; + const int y_1 = kCdefDirections[direction][1][0]; + const int x_1 = kCdefDirections[direction][1][1]; + output[0] = LoadUnaligned16(src - y_0 * stride - x_0); + output[1] = LoadUnaligned16(src + y_0 * stride + x_0); + output[2] = LoadUnaligned16(src - y_1 * stride - x_1); + output[3] = LoadUnaligned16(src + y_1 * stride + x_1); +} + +// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to +// do 2 rows at a time. +void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride, + __m128i* output, const int direction) { + const int y_0 = kCdefDirections[direction][0][0]; + const int x_0 = kCdefDirections[direction][0][1]; + const int y_1 = kCdefDirections[direction][1][0]; + const int x_1 = kCdefDirections[direction][1][1]; + output[0] = LoadHi8(LoadLo8(src - y_0 * stride - x_0), + src - y_0 * stride + stride - x_0); + output[1] = LoadHi8(LoadLo8(src + y_0 * stride + x_0), + src + y_0 * stride + stride + x_0); + output[2] = LoadHi8(LoadLo8(src - y_1 * stride - x_1), + src - y_1 * stride + stride - x_1); + output[3] = LoadHi8(LoadLo8(src + y_1 * stride + x_1), + src + y_1 * stride + stride + x_1); +} + +inline __m256i Constrain(const __m256i& pixel, const __m256i& reference, + const __m128i& damping, const __m256i& threshold) { + const __m256i diff = _mm256_sub_epi16(pixel, reference); + const __m256i abs_diff = _mm256_abs_epi16(diff); + // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping), + // 0, std::abs(diff)) + const __m256i shifted_diff = _mm256_srl_epi16(abs_diff, damping); + // For bitdepth == 8, the threshold range is [0, 15] and the 
damping range is + // [3, 6]. If pixel == kCdefLargeValue(0x4000), shifted_diff will always be + // larger than threshold. Subtract using saturation will return 0 when pixel + // == kCdefLargeValue. + static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue"); + const __m256i thresh_minus_shifted_diff = + _mm256_subs_epu16(threshold, shifted_diff); + const __m256i clamp_abs_diff = + _mm256_min_epi16(thresh_minus_shifted_diff, abs_diff); + // Restore the sign. + return _mm256_sign_epi16(clamp_abs_diff, diff); +} + +inline __m256i ApplyConstrainAndTap(const __m256i& pixel, const __m256i& val, + const __m256i& tap, const __m128i& damping, + const __m256i& threshold) { + const __m256i constrained = Constrain(val, pixel, damping, threshold); + return _mm256_mullo_epi16(constrained, tap); +} + +template <int width, bool enable_primary = true, bool enable_secondary = true> +void CdefFilter_AVX2(const uint16_t* src, const ptrdiff_t src_stride, + const int height, const int primary_strength, + const int secondary_strength, const int damping, + const int direction, void* dest, + const ptrdiff_t dst_stride) { + static_assert(width == 8 || width == 4, "Invalid CDEF width."); + static_assert(enable_primary || enable_secondary, ""); + constexpr bool clipping_required = enable_primary && enable_secondary; + auto* dst = static_cast<uint8_t*>(dest); + __m128i primary_damping_shift, secondary_damping_shift; + + // FloorLog2() requires input to be > 0. + // 8-bit damping range: Y: [3, 6], UV: [2, 5]. + if (enable_primary) { + // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary + // for UV filtering. + primary_damping_shift = + _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength))); + } + if (enable_secondary) { + // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is + // necessary. 
+ assert(damping - FloorLog2(secondary_strength) >= 0); + secondary_damping_shift = + _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength)); + } + const __m256i primary_tap_0 = _mm256_broadcastw_epi16( + _mm_cvtsi32_si128(kCdefPrimaryTaps[primary_strength & 1][0])); + const __m256i primary_tap_1 = _mm256_broadcastw_epi16( + _mm_cvtsi32_si128(kCdefPrimaryTaps[primary_strength & 1][1])); + const __m256i secondary_tap_0 = + _mm256_broadcastw_epi16(_mm_cvtsi32_si128(kCdefSecondaryTap0)); + const __m256i secondary_tap_1 = + _mm256_broadcastw_epi16(_mm_cvtsi32_si128(kCdefSecondaryTap1)); + const __m256i cdef_large_value_mask = _mm256_broadcastw_epi16( + _mm_cvtsi32_si128(static_cast<int16_t>(~kCdefLargeValue))); + const __m256i primary_threshold = + _mm256_broadcastw_epi16(_mm_cvtsi32_si128(primary_strength)); + const __m256i secondary_threshold = + _mm256_broadcastw_epi16(_mm_cvtsi32_si128(secondary_strength)); + + int y = height; + do { + __m128i pixel_128; + if (width == 8) { + pixel_128 = LoadUnaligned16(src); + } else { + pixel_128 = LoadHi8(LoadLo8(src), src + src_stride); + } + + __m256i pixel = SetrM128i(pixel_128, pixel_128); + + __m256i min = pixel; + __m256i max = pixel; + __m256i sum_pair; + + if (enable_primary) { + // Primary |direction|. + __m128i primary_val_128[4]; + if (width == 8) { + LoadDirection(src, src_stride, primary_val_128, direction); + } else { + LoadDirection4(src, src_stride, primary_val_128, direction); + } + + __m256i primary_val[2]; + primary_val[0] = SetrM128i(primary_val_128[0], primary_val_128[1]); + primary_val[1] = SetrM128i(primary_val_128[2], primary_val_128[3]); + + if (clipping_required) { + min = _mm256_min_epu16(min, primary_val[0]); + min = _mm256_min_epu16(min, primary_val[1]); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // primary max has been calculated, zero out the upper 8 bits. 
Use this + // to find the "16 bit" max. + const __m256i max_p01 = _mm256_max_epu8(primary_val[0], primary_val[1]); + max = _mm256_max_epu16( + max, _mm256_and_si256(max_p01, cdef_large_value_mask)); + } + + sum_pair = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0, + primary_damping_shift, primary_threshold); + sum_pair = _mm256_add_epi16( + sum_pair, + ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_1, + primary_damping_shift, primary_threshold)); + } else { + sum_pair = _mm256_setzero_si256(); + } + + if (enable_secondary) { + // Secondary |direction| values (+/- 2). Clamp |direction|. + __m128i secondary_val_128[8]; + if (width == 8) { + LoadDirection(src, src_stride, secondary_val_128, direction + 2); + LoadDirection(src, src_stride, secondary_val_128 + 4, direction - 2); + } else { + LoadDirection4(src, src_stride, secondary_val_128, direction + 2); + LoadDirection4(src, src_stride, secondary_val_128 + 4, direction - 2); + } + + __m256i secondary_val[4]; + secondary_val[0] = SetrM128i(secondary_val_128[0], secondary_val_128[1]); + secondary_val[1] = SetrM128i(secondary_val_128[2], secondary_val_128[3]); + secondary_val[2] = SetrM128i(secondary_val_128[4], secondary_val_128[5]); + secondary_val[3] = SetrM128i(secondary_val_128[6], secondary_val_128[7]); + + if (clipping_required) { + min = _mm256_min_epu16(min, secondary_val[0]); + min = _mm256_min_epu16(min, secondary_val[1]); + min = _mm256_min_epu16(min, secondary_val[2]); + min = _mm256_min_epu16(min, secondary_val[3]); + + const __m256i max_s01 = + _mm256_max_epu8(secondary_val[0], secondary_val[1]); + const __m256i max_s23 = + _mm256_max_epu8(secondary_val[2], secondary_val[3]); + const __m256i max_s = _mm256_max_epu8(max_s01, max_s23); + max = _mm256_max_epu8(max, + _mm256_and_si256(max_s, cdef_large_value_mask)); + } + + sum_pair = _mm256_add_epi16( + sum_pair, + ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum_pair 
= _mm256_add_epi16( + sum_pair, + ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + sum_pair = _mm256_add_epi16( + sum_pair, + ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum_pair = _mm256_add_epi16( + sum_pair, + ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + } + + __m128i sum = _mm_add_epi16(_mm256_castsi256_si128(sum_pair), + _mm256_extracti128_si256(sum_pair, 1)); + + // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)) + const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15); + // 8 + sum + sum = _mm_add_epi16(sum, _mm_set1_epi16(8)); + // (... - (sum < 0)) >> 4 + sum = _mm_add_epi16(sum, sum_lt_0); + sum = _mm_srai_epi16(sum, 4); + // pixel + ... + sum = _mm_add_epi16(sum, _mm256_castsi256_si128(pixel)); + if (clipping_required) { + const __m128i min_128 = _mm_min_epu16(_mm256_castsi256_si128(min), + _mm256_extracti128_si256(min, 1)); + + const __m128i max_128 = _mm_max_epu16(_mm256_castsi256_si128(max), + _mm256_extracti128_si256(max, 1)); + // Clip3 + sum = _mm_min_epi16(sum, max_128); + sum = _mm_max_epi16(sum, min_128); + } + + const __m128i result = _mm_packus_epi16(sum, sum); + if (width == 8) { + src += src_stride; + StoreLo8(dst, result); + dst += dst_stride; + --y; + } else { + src += src_stride << 1; + Store4(dst, result); + dst += dst_stride; + Store4(dst, _mm_srli_si128(result, 4)); + dst += dst_stride; + y -= 2; + } + } while (y != 0); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); + dsp->cdef_direction = CdefDirection_AVX2; + + dsp->cdef_filters[0][0] = CdefFilter_AVX2<4>; + dsp->cdef_filters[0][1] = + CdefFilter_AVX2<4, /*enable_primary=*/true, /*enable_secondary=*/false>; + dsp->cdef_filters[0][2] = CdefFilter_AVX2<4, /*enable_primary=*/false>; + dsp->cdef_filters[1][0] = 
CdefFilter_AVX2<8>; + dsp->cdef_filters[1][1] = + CdefFilter_AVX2<8, /*enable_primary=*/true, /*enable_secondary=*/false>; + dsp->cdef_filters[1][2] = CdefFilter_AVX2<8, /*enable_primary=*/false>; +} + +} // namespace +} // namespace low_bitdepth + +void CdefInit_AVX2() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 +#else // !LIBGAV1_TARGETING_AVX2 +namespace libgav1 { +namespace dsp { + +void CdefInit_AVX2() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_AVX2 diff --git a/src/dsp/x86/cdef_avx2.h b/src/dsp/x86/cdef_avx2.h new file mode 100644 index 0000000..41f2d3f --- /dev/null +++ b/src/dsp/x86/cdef_avx2.h @@ -0,0 +1,45 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_ +#define LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not +// thread-safe. 
+void CdefInit_AVX2(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_TARGETING_AVX2 + +#ifndef LIBGAV1_Dsp8bpp_CdefDirection +#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_AVX2 +#endif + +#ifndef LIBGAV1_Dsp8bpp_CdefFilters +#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_AVX2 +#endif + +#endif // LIBGAV1_TARGETING_AVX2 + +#endif // LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_ diff --git a/src/dsp/x86/cdef_sse4.cc b/src/dsp/x86/cdef_sse4.cc index 3211a2d..6ede778 100644 --- a/src/dsp/x86/cdef_sse4.cc +++ b/src/dsp/x86/cdef_sse4.cc @@ -349,8 +349,8 @@ inline uint32_t SumVector_S32(__m128i a) { inline uint32_t Cost0Or4(const __m128i a, const __m128i b, const __m128i division_table[2]) { // Reverse and clear upper 2 bytes. - const __m128i reverser = - _mm_set_epi32(0x80800100, 0x03020504, 0x07060908, 0x0b0a0d0c); + const __m128i reverser = _mm_set_epi32(static_cast<int>(0x80800100), + 0x03020504, 0x07060908, 0x0b0a0d0c); // 14 13 12 11 10 09 08 ZZ const __m128i b_reversed = _mm_shuffle_epi8(b, reverser); // 00 14 01 13 02 12 03 11 @@ -371,7 +371,8 @@ inline uint32_t CostOdd(const __m128i a, const __m128i b, const __m128i division_table[2]) { // Reverse and clear upper 10 bytes. 
const __m128i reverser = - _mm_set_epi32(0x80808080, 0x80808080, 0x80800100, 0x03020504); + _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080), + static_cast<int>(0x80800100), 0x03020504); // 10 09 08 ZZ ZZ ZZ ZZ ZZ const __m128i b_reversed = _mm_shuffle_epi8(b, reverser); // 00 10 01 09 02 08 03 ZZ @@ -717,7 +718,7 @@ void CdefInit_SSE4_1() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/common_avx2.h b/src/dsp/x86/common_avx2.h index 4ce7de2..373116a 100644 --- a/src/dsp/x86/common_avx2.h +++ b/src/dsp/x86/common_avx2.h @@ -27,109 +27,60 @@ #include <cassert> #include <cstddef> #include <cstdint> +#include <cstring> namespace libgav1 { namespace dsp { - -//------------------------------------------------------------------------------ -// Compatibility functions. - -inline __m256i SetrM128i(const __m128i lo, const __m128i hi) { - // For compatibility with older gcc toolchains (< 8) use - // _mm256_inserti128_si256 over _mm256_setr_m128i. Newer gcc implementations - // are implemented similarly to the following, clang uses a different method - // but no differences in assembly have been observed. - return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); -} - -//------------------------------------------------------------------------------ -// Load functions. 
- -inline __m256i LoadAligned32(const void* a) { - assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); - return _mm256_load_si256(static_cast<const __m256i*>(a)); -} - -inline void LoadAligned64(const void* a, __m256i dst[2]) { - assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); - dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0); - dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1); -} - -inline __m256i LoadUnaligned32(const void* a) { - return _mm256_loadu_si256(static_cast<const __m256i*>(a)); -} - -//------------------------------------------------------------------------------ -// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning. - -inline __m256i MaskOverreads(const __m256i source, - const ptrdiff_t over_read_in_bytes) { - __m256i dst = source; -#if LIBGAV1_MSAN - if (over_read_in_bytes >= 32) return _mm256_setzero_si256(); - if (over_read_in_bytes > 0) { - __m128i m = _mm_set1_epi8(-1); - for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) { - m = _mm_srli_si128(m, 1); - } - const __m256i mask = (over_read_in_bytes < 16) - ? 
SetrM128i(_mm_set1_epi8(-1), m) - : SetrM128i(m, _mm_setzero_si128()); - dst = _mm256_and_si256(dst, mask); - } -#else - static_cast<void>(over_read_in_bytes); -#endif - return dst; -} - -inline __m256i LoadAligned32Msan(const void* const source, - const ptrdiff_t over_read_in_bytes) { - return MaskOverreads(LoadAligned32(source), over_read_in_bytes); -} - -inline void LoadAligned64Msan(const void* const source, - const ptrdiff_t over_read_in_bytes, - __m256i dst[2]) { - dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes); - dst[1] = MaskOverreads(LoadAligned32(static_cast<const __m256i*>(source) + 1), - over_read_in_bytes); -} - -inline __m256i LoadUnaligned32Msan(const void* const source, - const ptrdiff_t over_read_in_bytes) { - return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes); -} - -//------------------------------------------------------------------------------ -// Store functions. - -inline void StoreAligned32(void* a, const __m256i v) { - assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); - _mm256_store_si256(static_cast<__m256i*>(a), v); -} - -inline void StoreAligned64(void* a, const __m256i v[2]) { - assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); - _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]); - _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]); -} - -inline void StoreUnaligned32(void* a, const __m256i v) { - _mm256_storeu_si256(static_cast<__m256i*>(a), v); -} - -//------------------------------------------------------------------------------ -// Arithmetic utilities. 
- -inline __m256i RightShiftWithRounding_S16(const __m256i v_val_d, int bits) { - assert(bits <= 16); - const __m256i v_bias_d = - _mm256_set1_epi16(static_cast<int16_t>((1 << bits) >> 1)); - const __m256i v_tmp_d = _mm256_add_epi16(v_val_d, v_bias_d); - return _mm256_srai_epi16(v_tmp_d, bits); -} +namespace avx2 { + +#include "src/dsp/x86/common_avx2.inc" +#include "src/dsp/x86/common_sse4.inc" + +} // namespace avx2 + +// NOLINTBEGIN(misc-unused-using-decls) +// These function aliases shall not be visible to external code. They are +// restricted to x86/*_avx2.cc files only. This scheme exists to distinguish two +// possible implementations of common functions, which may differ based on +// whether the compiler is permitted to use avx2 instructions. + +// common_sse4.inc +using avx2::Load2; +using avx2::Load2x2; +using avx2::Load4; +using avx2::Load4x2; +using avx2::LoadAligned16; +using avx2::LoadAligned16Msan; +using avx2::LoadHi8; +using avx2::LoadHi8Msan; +using avx2::LoadLo8; +using avx2::LoadLo8Msan; +using avx2::LoadUnaligned16; +using avx2::LoadUnaligned16Msan; +using avx2::MaskHighNBytes; +using avx2::RightShiftWithRounding_S16; +using avx2::RightShiftWithRounding_S32; +using avx2::RightShiftWithRounding_U16; +using avx2::RightShiftWithRounding_U32; +using avx2::Store2; +using avx2::Store4; +using avx2::StoreAligned16; +using avx2::StoreHi8; +using avx2::StoreLo8; +using avx2::StoreUnaligned16; + +// common_avx2.inc +using avx2::LoadAligned32; +using avx2::LoadAligned32Msan; +using avx2::LoadAligned64; +using avx2::LoadAligned64Msan; +using avx2::LoadUnaligned32; +using avx2::LoadUnaligned32Msan; +using avx2::SetrM128i; +using avx2::StoreAligned32; +using avx2::StoreAligned64; +using avx2::StoreUnaligned32; +// NOLINTEND } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/x86/common_avx2.inc b/src/dsp/x86/common_avx2.inc new file mode 100644 index 0000000..53b4e2e --- /dev/null +++ b/src/dsp/x86/common_avx2.inc @@ -0,0 +1,121 @@ +/* + * 
Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//------------------------------------------------------------------------------ +// Compatibility functions. + +inline __m256i SetrM128i(const __m128i lo, const __m128i hi) { + // For compatibility with older gcc toolchains (< 8) use + // _mm256_inserti128_si256 over _mm256_setr_m128i. Newer gcc implementations + // are implemented similarly to the following, clang uses a different method + // but no differences in assembly have been observed. + return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); +} + +//------------------------------------------------------------------------------ +// Load functions. + +inline __m256i LoadAligned32(const void* a) { + assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); + return _mm256_load_si256(static_cast<const __m256i*>(a)); +} + +inline void LoadAligned64(const void* a, __m256i dst[2]) { + assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); + dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0); + dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1); +} + +inline __m256i LoadUnaligned32(const void* a) { + return _mm256_loadu_si256(static_cast<const __m256i*>(a)); +} + +//------------------------------------------------------------------------------ +// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning. 
+ +inline __m256i MaskOverreads(const __m256i source, + const ptrdiff_t over_read_in_bytes) { + __m256i dst = source; +#if LIBGAV1_MSAN + if (over_read_in_bytes >= 32) return _mm256_setzero_si256(); + if (over_read_in_bytes > 0) { + __m128i m = _mm_set1_epi8(-1); + for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) { + m = _mm_srli_si128(m, 1); + } + const __m256i mask = (over_read_in_bytes < 16) + ? SetrM128i(_mm_set1_epi8(-1), m) + : SetrM128i(m, _mm_setzero_si128()); + dst = _mm256_and_si256(dst, mask); + } +#else + static_cast<void>(over_read_in_bytes); +#endif + return dst; +} + +inline __m256i LoadAligned32Msan(const void* const source, + const ptrdiff_t over_read_in_bytes) { + return MaskOverreads(LoadAligned32(source), over_read_in_bytes); +} + +inline void LoadAligned64Msan(const void* const source, + const ptrdiff_t over_read_in_bytes, + __m256i dst[2]) { + dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes); + dst[1] = MaskOverreads(LoadAligned32(static_cast<const __m256i*>(source) + 1), + over_read_in_bytes); +} + +inline __m256i LoadUnaligned32Msan(const void* const source, + const ptrdiff_t over_read_in_bytes) { + return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes); +} + +//------------------------------------------------------------------------------ +// Store functions. + +inline void StoreAligned32(void* a, const __m256i v) { + assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); + _mm256_store_si256(static_cast<__m256i*>(a), v); +} + +inline void StoreAligned64(void* a, const __m256i v[2]) { + assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); + _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]); + _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]); +} + +inline void StoreUnaligned32(void* a, const __m256i v) { + _mm256_storeu_si256(static_cast<__m256i*>(a), v); +} + +//------------------------------------------------------------------------------ +// Arithmetic utilities. 
+ +inline __m256i RightShiftWithRounding_S16(const __m256i v_val_d, int bits) { + assert(bits <= 16); + const __m256i v_bias_d = + _mm256_set1_epi16(static_cast<int16_t>((1 << bits) >> 1)); + const __m256i v_tmp_d = _mm256_add_epi16(v_val_d, v_bias_d); + return _mm256_srai_epi16(v_tmp_d, bits); +} + +inline __m256i RightShiftWithRounding_S32(const __m256i v_val_d, int bits) { + const __m256i v_bias_d = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i v_tmp_d = _mm256_add_epi32(v_val_d, v_bias_d); + return _mm256_srai_epi32(v_tmp_d, bits); +} diff --git a/src/dsp/x86/common_sse4.h b/src/dsp/x86/common_sse4.h index c510f8c..41a3a68 100644 --- a/src/dsp/x86/common_sse4.h +++ b/src/dsp/x86/common_sse4.h @@ -28,7 +28,6 @@ #include <cassert> #include <cstddef> #include <cstdint> -#include <cstdlib> #include <cstring> #if 0 @@ -71,192 +70,58 @@ inline void PrintRegX(const int r, const char* const name) { #define PR(var, N) PrintReg(var, #var, N) #define PD(var) PrintReg(var, #var); #define PX(var) PrintRegX(var, #var); -#endif // 0 - -namespace libgav1 { -namespace dsp { - -//------------------------------------------------------------------------------ -// Load functions. - -inline __m128i Load2(const void* src) { - int16_t val; - memcpy(&val, src, sizeof(val)); - return _mm_cvtsi32_si128(val); -} - -inline __m128i Load2x2(const void* src1, const void* src2) { - uint16_t val1; - uint16_t val2; - memcpy(&val1, src1, sizeof(val1)); - memcpy(&val2, src2, sizeof(val2)); - return _mm_cvtsi32_si128(val1 | (val2 << 16)); -} - -// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1. -template <int lane> -inline __m128i Load2(const void* const buf, __m128i val) { - uint16_t temp; - memcpy(&temp, buf, 2); - return _mm_insert_epi16(val, temp, lane); -} - -inline __m128i Load4(const void* src) { - // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32 - // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a - // movss instruction. 
- // - // Until compiler support of _mm_loadu_si32 is widespread, use of - // _mm_loadu_si32 is banned. - int val; - memcpy(&val, src, sizeof(val)); - return _mm_cvtsi32_si128(val); -} - -inline __m128i Load4x2(const void* src1, const void* src2) { - // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32 - // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a - // movss instruction. - // - // Until compiler support of _mm_loadu_si32 is widespread, use of - // _mm_loadu_si32 is banned. - int val1, val2; - memcpy(&val1, src1, sizeof(val1)); - memcpy(&val2, src2, sizeof(val2)); - return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1); -} -inline __m128i LoadLo8(const void* a) { - return _mm_loadl_epi64(static_cast<const __m128i*>(a)); -} - -inline __m128i LoadHi8(const __m128i v, const void* a) { - const __m128 x = - _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a)); - return _mm_castps_si128(x); -} - -inline __m128i LoadUnaligned16(const void* a) { - return _mm_loadu_si128(static_cast<const __m128i*>(a)); -} - -inline __m128i LoadAligned16(const void* a) { - assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0); - return _mm_load_si128(static_cast<const __m128i*>(a)); -} - -//------------------------------------------------------------------------------ -// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning. 
- -inline __m128i MaskOverreads(const __m128i source, - const ptrdiff_t over_read_in_bytes) { - __m128i dst = source; #if LIBGAV1_MSAN - if (over_read_in_bytes > 0) { - __m128i mask = _mm_set1_epi8(-1); - for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) { - mask = _mm_srli_si128(mask, 1); - } - dst = _mm_and_si128(dst, mask); - } -#else - static_cast<void>(over_read_in_bytes); -#endif - return dst; -} +#include <sanitizer/msan_interface.h> -inline __m128i LoadLo8Msan(const void* const source, - const ptrdiff_t over_read_in_bytes) { - return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8); +inline void PrintShadow(const void* r, const char* const name, + const size_t size) { + fprintf(stderr, "Shadow for %s:\n", name); + __msan_print_shadow(r, size); } +#define PS(var, N) PrintShadow(var, #var, N) -inline __m128i LoadHi8Msan(const __m128i v, const void* source, - const ptrdiff_t over_read_in_bytes) { - return MaskOverreads(LoadHi8(v, source), over_read_in_bytes); -} - -inline __m128i LoadAligned16Msan(const void* const source, - const ptrdiff_t over_read_in_bytes) { - return MaskOverreads(LoadAligned16(source), over_read_in_bytes); -} +#endif // LIBGAV1_MSAN -inline __m128i LoadUnaligned16Msan(const void* const source, - const ptrdiff_t over_read_in_bytes) { - return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes); -} - -//------------------------------------------------------------------------------ -// Store functions. 
- -inline void Store2(void* dst, const __m128i x) { - const int val = _mm_cvtsi128_si32(x); - memcpy(dst, &val, 2); -} - -inline void Store4(void* dst, const __m128i x) { - const int val = _mm_cvtsi128_si32(x); - memcpy(dst, &val, sizeof(val)); -} - -inline void StoreLo8(void* a, const __m128i v) { - _mm_storel_epi64(static_cast<__m128i*>(a), v); -} - -inline void StoreHi8(void* a, const __m128i v) { - _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v)); -} - -inline void StoreAligned16(void* a, const __m128i v) { - assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0); - _mm_store_si128(static_cast<__m128i*>(a), v); -} - -inline void StoreUnaligned16(void* a, const __m128i v) { - _mm_storeu_si128(static_cast<__m128i*>(a), v); -} - -//------------------------------------------------------------------------------ -// Arithmetic utilities. - -inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) { - assert(bits <= 16); - // Shift out all but the last bit. - const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1); - // Avg with zero will shift by 1 and round. 
- return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128()); -} - -inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) { - assert(bits <= 16); - const __m128i v_bias_d = - _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1)); - const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d); - return _mm_srai_epi16(v_tmp_d, bits); -} - -inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) { - const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); - const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); - return _mm_srli_epi32(v_tmp_d, bits); -} - -inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) { - const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); - const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); - return _mm_srai_epi32(v_tmp_d, bits); -} - -//------------------------------------------------------------------------------ -// Masking utilities -inline __m128i MaskHighNBytes(int n) { - static constexpr uint8_t kMask[32] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - }; +#endif // 0 - return LoadUnaligned16(kMask + n); -} +namespace libgav1 { +namespace dsp { +namespace sse4 { + +#include "src/dsp/x86/common_sse4.inc" + +} // namespace sse4 + +// NOLINTBEGIN(misc-unused-using-decls) +// These function aliases shall not be visible to external code. They are +// restricted to x86/*_sse4.cc files only. This scheme exists to distinguish two +// possible implementations of common functions, which may differ based on +// whether the compiler is permitted to use avx2 instructions. 
+using sse4::Load2; +using sse4::Load2x2; +using sse4::Load4; +using sse4::Load4x2; +using sse4::LoadAligned16; +using sse4::LoadAligned16Msan; +using sse4::LoadHi8; +using sse4::LoadHi8Msan; +using sse4::LoadLo8; +using sse4::LoadLo8Msan; +using sse4::LoadUnaligned16; +using sse4::LoadUnaligned16Msan; +using sse4::MaskHighNBytes; +using sse4::RightShiftWithRounding_S16; +using sse4::RightShiftWithRounding_S32; +using sse4::RightShiftWithRounding_U16; +using sse4::RightShiftWithRounding_U32; +using sse4::Store2; +using sse4::Store4; +using sse4::StoreAligned16; +using sse4::StoreHi8; +using sse4::StoreLo8; +using sse4::StoreUnaligned16; +// NOLINTEND } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/x86/common_sse4.inc b/src/dsp/x86/common_sse4.inc new file mode 100644 index 0000000..35c56b8 --- /dev/null +++ b/src/dsp/x86/common_sse4.inc @@ -0,0 +1,206 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//------------------------------------------------------------------------------ +// Load functions. 
+ +inline __m128i Load2(const void* src) { + int16_t val; + memcpy(&val, src, sizeof(val)); + return _mm_cvtsi32_si128(val); +} + +inline __m128i Load2x2(const void* src1, const void* src2) { + uint16_t val1; + uint16_t val2; + memcpy(&val1, src1, sizeof(val1)); + memcpy(&val2, src2, sizeof(val2)); + return _mm_cvtsi32_si128(val1 | (val2 << 16)); +} + +// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1. +template <int lane> +inline __m128i Load2(const void* const buf, __m128i val) { + int16_t temp; + memcpy(&temp, buf, 2); + return _mm_insert_epi16(val, temp, lane); +} + +inline __m128i Load4(const void* src) { + // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32 + // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a + // movss instruction. + // + // Until compiler support of _mm_loadu_si32 is widespread, use of + // _mm_loadu_si32 is banned. + int val; + memcpy(&val, src, sizeof(val)); + return _mm_cvtsi32_si128(val); +} + +inline __m128i Load4x2(const void* src1, const void* src2) { + // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32 + // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a + // movss instruction. + // + // Until compiler support of _mm_loadu_si32 is widespread, use of + // _mm_loadu_si32 is banned. 
+ int val1, val2; + memcpy(&val1, src1, sizeof(val1)); + memcpy(&val2, src2, sizeof(val2)); + return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1); +} + +inline __m128i LoadLo8(const void* a) { + return _mm_loadl_epi64(static_cast<const __m128i*>(a)); +} + +inline __m128i LoadHi8(const __m128i v, const void* a) { + const __m128 x = + _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a)); + return _mm_castps_si128(x); +} + +inline __m128i LoadUnaligned16(const void* a) { + return _mm_loadu_si128(static_cast<const __m128i*>(a)); +} + +inline __m128i LoadAligned16(const void* a) { + assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0); + return _mm_load_si128(static_cast<const __m128i*>(a)); +} + +//------------------------------------------------------------------------------ +// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning. + +inline __m128i MaskOverreads(const __m128i source, + const ptrdiff_t over_read_in_bytes) { + __m128i dst = source; +#if LIBGAV1_MSAN + if (over_read_in_bytes > 0) { + __m128i mask = _mm_set1_epi8(-1); + for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) { + mask = _mm_srli_si128(mask, 1); + } + dst = _mm_and_si128(dst, mask); + } +#else + static_cast<void>(over_read_in_bytes); +#endif + return dst; +} + +inline __m128i LoadLo8Msan(const void* const source, + const ptrdiff_t over_read_in_bytes) { + return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8); +} + +inline __m128i LoadHi8Msan(const __m128i v, const void* source, + const ptrdiff_t over_read_in_bytes) { + return MaskOverreads(LoadHi8(v, source), over_read_in_bytes); +} + +inline __m128i LoadAligned16Msan(const void* const source, + const ptrdiff_t over_read_in_bytes) { + return MaskOverreads(LoadAligned16(source), over_read_in_bytes); +} + +inline __m128i LoadUnaligned16Msan(const void* const source, + const ptrdiff_t over_read_in_bytes) { + return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes); +} + 
+//------------------------------------------------------------------------------ +// Store functions. + +inline void Store2(void* dst, const __m128i x) { + const int val = _mm_cvtsi128_si32(x); + memcpy(dst, &val, 2); +} + +inline void Store4(void* dst, const __m128i x) { + const int val = _mm_cvtsi128_si32(x); + memcpy(dst, &val, sizeof(val)); +} + +inline void StoreLo8(void* a, const __m128i v) { + _mm_storel_epi64(static_cast<__m128i*>(a), v); +} + +inline void StoreHi8(void* a, const __m128i v) { + _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v)); +} + +inline void StoreAligned16(void* a, const __m128i v) { + assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0); + _mm_store_si128(static_cast<__m128i*>(a), v); +} + +inline void StoreUnaligned16(void* a, const __m128i v) { + _mm_storeu_si128(static_cast<__m128i*>(a), v); +} + +//------------------------------------------------------------------------------ +// Arithmetic utilities. + +inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) { + assert(bits <= 16); + // Shift out all but the last bit. + const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1); + // Avg with zero will shift by 1 and round. 
+ return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128()); +} + +inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) { + assert(bits < 16); + const __m128i v_bias_d = + _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1)); + const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d); + return _mm_srai_epi16(v_tmp_d, bits); +} + +inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); + return _mm_srli_epi32(v_tmp_d, bits); +} + +inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); + return _mm_srai_epi32(v_tmp_d, bits); +} + +// Use this when |bits| is not an immediate value. +inline __m128i VariableRightShiftWithRounding_S32(const __m128i v_val_d, + int bits) { + const __m128i v_bias_d = + _mm_set1_epi32(static_cast<int32_t>((1 << bits) >> 1)); + const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); + return _mm_sra_epi32(v_tmp_d, _mm_cvtsi32_si128(bits)); +} + +//------------------------------------------------------------------------------ +// Masking utilities +inline __m128i MaskHighNBytes(int n) { + static constexpr uint8_t kMask[32] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + }; + + return LoadUnaligned16(kMask + n); +} diff --git a/src/dsp/x86/convolve_avx2.cc b/src/dsp/x86/convolve_avx2.cc index 3df2120..2ecb77c 100644 --- a/src/dsp/x86/convolve_avx2.cc +++ b/src/dsp/x86/convolve_avx2.cc @@ -26,7 +26,6 @@ #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/dsp/x86/common_avx2.h" -#include "src/dsp/x86/common_sse4.h" #include "src/utils/common.h" #include "src/utils/constants.h" @@ -35,7 +34,7 @@ namespace dsp { namespace low_bitdepth { 
namespace { -constexpr int kHorizontalOffset = 3; +#include "src/dsp/x86/convolve_sse4.inc" // Multiply every entry in |src[]| by the corresponding entry in |taps[]| and // sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final @@ -118,58 +117,15 @@ __m256i SimpleHorizontalTaps(const __m256i* const src, } template <int filter_index> -__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, - const __m128i* const v_tap) { - // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 - const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]); - - if (filter_index == 3) { - // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17 - const __m128i v_src_43 = _mm_shuffle_epi8( - v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403)); - const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3 - return v_sum_43; - } - - // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16 - const __m128i v_src_32 = _mm_shuffle_epi8( - v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302)); - // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx - const __m128i v_src_54 = _mm_shuffle_epi8( - v_src, _mm_set_epi32(0x800f0f0e, 0x0e0d0d0c, 0x80070706, 0x06050504)); - const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2 - const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4 - const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32); - return v_sum_5432; -} - -template <int filter_index> -__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, - const __m128i* const v_tap) { - __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); - - // Normally the Horizontal pass does the downshift in two passes: - // kInterRoundBitsHorizontal - 1 and then (kFilterBits - - // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them - // requires adding the rounding offset from the skipped shift. 
- constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); - - sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit)); - sum = RightShiftWithRounding_S16(sum, kFilterBits - 1); - return _mm_packus_epi16(sum, sum); -} - -template <int filter_index> -__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride, - const __m128i* const v_tap) { - const __m128i sum = - SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); +__m256i HorizontalTaps8To16(const __m256i* const src, + const __m256i* const v_tap) { + const __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap); return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); } // Filter 2xh sizes. -template <int num_taps, int step, int filter_index, bool is_2d = false, +template <int num_taps, int filter_index, bool is_2d = false, bool is_compound = false> void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, void* const dest, const ptrdiff_t pred_stride, @@ -183,7 +139,8 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, assert(num_taps <= 4); if (num_taps <= 4) { if (!is_compound) { - int y = 0; + int y = height; + if (is_2d) y -= 1; do { if (is_2d) { const __m128i sum = @@ -202,8 +159,8 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, } src += src_stride << 1; - y += 2; - } while (y < height - 1); + y -= 2; + } while (y != 0); // The 2d filters have an odd |height| because the horizontal pass // generates context for the vertical pass. @@ -236,7 +193,7 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, } // Filter widths >= 4. 
-template <int num_taps, int step, int filter_index, bool is_2d = false, +template <int num_taps, int filter_index, bool is_2d = false, bool is_compound = false> void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, void* const dest, const ptrdiff_t pred_stride, @@ -251,7 +208,22 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, int x = 0; do { if (is_2d || is_compound) { - // placeholder + // Load into 2 128 bit lanes. + const __m256i src_long = + SetrM128i(LoadUnaligned16(&src[x]), LoadUnaligned16(&src[x + 8])); + const __m256i result = + HorizontalTaps8To16<filter_index>(&src_long, v_tap); + const __m256i src_long2 = SetrM128i(LoadUnaligned16(&src[x + 16]), + LoadUnaligned16(&src[x + 24])); + const __m256i result2 = + HorizontalTaps8To16<filter_index>(&src_long2, v_tap); + if (is_2d) { + StoreAligned32(&dest16[x], result); + StoreAligned32(&dest16[x + 16], result2); + } else { + StoreUnaligned32(&dest16[x], result); + StoreUnaligned32(&dest16[x + 16], result2); + } } else { // Load src used to calculate dest8[7:0] and dest8[23:16]. const __m256i src_long = LoadUnaligned32(&src[x]); @@ -264,7 +236,7 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, // Combine results and store. StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2)); } - x += step * 4; + x += 32; } while (x < width); src += src_stride; dest8 += pred_stride; @@ -272,9 +244,26 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, } while (--y != 0); } else if (width == 16) { int y = height; + if (is_2d) y -= 1; do { if (is_2d || is_compound) { - // placeholder + // Load into 2 128 bit lanes. 
+ const __m256i src_long = + SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8])); + const __m256i result = + HorizontalTaps8To16<filter_index>(&src_long, v_tap); + const __m256i src_long2 = + SetrM128i(LoadUnaligned16(&src[src_stride]), + LoadUnaligned16(&src[8 + src_stride])); + const __m256i result2 = + HorizontalTaps8To16<filter_index>(&src_long2, v_tap); + if (is_2d) { + StoreAligned32(&dest16[0], result); + StoreAligned32(&dest16[pred_stride], result2); + } else { + StoreUnaligned32(&dest16[0], result); + StoreUnaligned32(&dest16[pred_stride], result2); + } } else { // Load into 2 128 bit lanes. const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]), @@ -295,11 +284,37 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, dest16 += pred_stride * 2; y -= 2; } while (y != 0); + + // The 2d filters have an odd |height| during the horizontal pass, so + // filter the remaining row. + if (is_2d) { + const __m256i src_long = + SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8])); + const __m256i result = + HorizontalTaps8To16<filter_index>(&src_long, v_tap); + StoreAligned32(&dest16[0], result); + } + } else if (width == 8) { int y = height; + if (is_2d) y -= 1; do { + // Load into 2 128 bit lanes. 
+ const __m128i this_row = LoadUnaligned16(&src[0]); + const __m128i next_row = LoadUnaligned16(&src[src_stride]); + const __m256i src_long = SetrM128i(this_row, next_row); if (is_2d || is_compound) { - // placeholder + const __m256i result = + HorizontalTaps8To16<filter_index>(&src_long, v_tap); + if (is_2d) { + StoreAligned16(&dest16[0], _mm256_castsi256_si128(result)); + StoreAligned16(&dest16[pred_stride], + _mm256_extracti128_si256(result, 1)); + } else { + StoreUnaligned16(&dest16[0], _mm256_castsi256_si128(result)); + StoreUnaligned16(&dest16[pred_stride], + _mm256_extracti128_si256(result, 1)); + } } else { const __m128i this_row = LoadUnaligned16(&src[0]); const __m128i next_row = LoadUnaligned16(&src[src_stride]); @@ -315,11 +330,29 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, dest16 += pred_stride * 2; y -= 2; } while (y != 0); + + // The 2d filters have an odd |height| during the horizontal pass, so + // filter the remaining row. + if (is_2d) { + const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0])); + const __m256i result = + HorizontalTaps8To16<filter_index>(&src_long, v_tap); + StoreAligned16(&dest16[0], _mm256_castsi256_si128(result)); + } + } else { // width == 4 int y = height; + if (is_2d) y -= 1; do { + // Load into 2 128 bit lanes. 
+ const __m128i this_row = LoadUnaligned16(&src[0]); + const __m128i next_row = LoadUnaligned16(&src[src_stride]); + const __m256i src_long = SetrM128i(this_row, next_row); if (is_2d || is_compound) { - // placeholder + const __m256i result = + HorizontalTaps8To16<filter_index>(&src_long, v_tap); + StoreLo8(&dest16[0], _mm256_castsi256_si128(result)); + StoreLo8(&dest16[pred_stride], _mm256_extracti128_si256(result, 1)); } else { const __m128i this_row = LoadUnaligned16(&src[0]); const __m128i next_row = LoadUnaligned16(&src[src_stride]); @@ -335,93 +368,176 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, dest16 += pred_stride * 2; y -= 2; } while (y != 0); + + // The 2d filters have an odd |height| during the horizontal pass, so + // filter the remaining row. + if (is_2d) { + const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0])); + const __m256i result = + HorizontalTaps8To16<filter_index>(&src_long, v_tap); + StoreLo8(&dest16[0], _mm256_castsi256_si128(result)); + } } } template <int num_taps, bool is_2d_vertical = false> LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter, - __m128i* v_tap) { + __m256i* v_tap) { if (num_taps == 8) { - v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0 - v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 - v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 - v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6 if (is_2d_vertical) { - v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); - v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); - v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); - v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]); + v_tap[0] = _mm256_broadcastd_epi32(*filter); // k1k0 + v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 4)); // k3k2 + v_tap[2] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 8)); // k5k4 + v_tap[3] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 12)); // k7k6 } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = 
_mm_unpacklo_epi64(v_tap[1], v_tap[1]); - v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); - v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]); + v_tap[0] = _mm256_broadcastw_epi16(*filter); // k1k0 + v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2 + v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4 + v_tap[3] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 6)); // k7k6 } } else if (num_taps == 6) { - const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); - v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1 - v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 - v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5 if (is_2d_vertical) { - v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); - v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); - v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); + v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 2)); // k2k1 + v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 6)); // k4k3 + v_tap[2] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 10)); // k6k5 } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); - v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); + v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 1)); // k2k1 + v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3 + v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 5)); // k6k5 } } else if (num_taps == 4) { - v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 - v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 if (is_2d_vertical) { - v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); - v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 4)); // k3k2 + v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 8)); // k5k4 } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + v_tap[0] = 
_mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2 + v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4 } } else { // num_taps == 2 - const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); - v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 if (is_2d_vertical) { - v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 6)); // k4k3 } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3 } } } -template <int num_taps, bool is_2d_vertical = false> -LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter, - __m256i* v_tap) { - if (num_taps == 8) { - v_tap[0] = _mm256_broadcastw_epi16(*filter); // k1k0 - v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2 - v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4 - v_tap[3] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 6)); // k7k6 - if (is_2d_vertical) { - // placeholder - } - } else if (num_taps == 6) { - v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 1)); // k2k1 - v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3 - v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 5)); // k6k5 - if (is_2d_vertical) { - // placeholder - } - } else if (num_taps == 4) { - v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2 - v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4 - if (is_2d_vertical) { - // placeholder - } - } else { // num_taps == 2 - v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3 - if (is_2d_vertical) { - // placeholder +template <int num_taps, bool is_compound> +__m256i SimpleSum2DVerticalTaps(const __m256i* const src, + const __m256i* const taps) { + __m256i sum_lo = + _mm256_madd_epi16(_mm256_unpacklo_epi16(src[0], src[1]), taps[0]); + __m256i sum_hi = + 
_mm256_madd_epi16(_mm256_unpackhi_epi16(src[0], src[1]), taps[0]); + if (num_taps >= 4) { + __m256i madd_lo = + _mm256_madd_epi16(_mm256_unpacklo_epi16(src[2], src[3]), taps[1]); + __m256i madd_hi = + _mm256_madd_epi16(_mm256_unpackhi_epi16(src[2], src[3]), taps[1]); + sum_lo = _mm256_add_epi32(sum_lo, madd_lo); + sum_hi = _mm256_add_epi32(sum_hi, madd_hi); + if (num_taps >= 6) { + madd_lo = + _mm256_madd_epi16(_mm256_unpacklo_epi16(src[4], src[5]), taps[2]); + madd_hi = + _mm256_madd_epi16(_mm256_unpackhi_epi16(src[4], src[5]), taps[2]); + sum_lo = _mm256_add_epi32(sum_lo, madd_lo); + sum_hi = _mm256_add_epi32(sum_hi, madd_hi); + if (num_taps == 8) { + madd_lo = + _mm256_madd_epi16(_mm256_unpacklo_epi16(src[6], src[7]), taps[3]); + madd_hi = + _mm256_madd_epi16(_mm256_unpackhi_epi16(src[6], src[7]), taps[3]); + sum_lo = _mm256_add_epi32(sum_lo, madd_lo); + sum_hi = _mm256_add_epi32(sum_hi, madd_hi); + } } } + + if (is_compound) { + return _mm256_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), + RightShiftWithRounding_S32(sum_hi, + kInterRoundBitsCompoundVertical - 1)); + } + + return _mm256_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), + RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1)); +} + +template <int num_taps, bool is_compound = false> +void Filter2DVertical16xH(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int width, + const int height, const __m256i* const taps) { + assert(width >= 8); + constexpr int next_row = num_taps - 1; + // The Horizontal pass uses |width| as |stride| for the intermediate buffer. 
+ const ptrdiff_t src_stride = width; + + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + int x = 0; + do { + __m256i srcs[8]; + const uint16_t* src_x = src + x; + srcs[0] = LoadAligned32(src_x); + src_x += src_stride; + if (num_taps >= 4) { + srcs[1] = LoadAligned32(src_x); + src_x += src_stride; + srcs[2] = LoadAligned32(src_x); + src_x += src_stride; + if (num_taps >= 6) { + srcs[3] = LoadAligned32(src_x); + src_x += src_stride; + srcs[4] = LoadAligned32(src_x); + src_x += src_stride; + if (num_taps == 8) { + srcs[5] = LoadAligned32(src_x); + src_x += src_stride; + srcs[6] = LoadAligned32(src_x); + src_x += src_stride; + } + } + } + + auto* dst8_x = dst8 + x; + auto* dst16_x = dst16 + x; + int y = height; + do { + srcs[next_row] = LoadAligned32(src_x); + src_x += src_stride; + + const __m256i sum = + SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); + if (is_compound) { + StoreUnaligned32(dst16_x, sum); + dst16_x += dst_stride; + } else { + const __m128i packed_sum = _mm_packus_epi16( + _mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); + StoreUnaligned16(dst8_x, packed_sum); + dst8_x += dst_stride; + } + + srcs[0] = srcs[1]; + if (num_taps >= 4) { + srcs[1] = srcs[2]; + srcs[2] = srcs[3]; + if (num_taps >= 6) { + srcs[3] = srcs[4]; + srcs[4] = srcs[5]; + if (num_taps == 8) { + srcs[5] = srcs[6]; + srcs[6] = srcs[7]; + } + } + } + } while (--y != 0); + x += 16; + } while (x < width); } template <bool is_2d = false, bool is_compound = false> @@ -436,16 +552,16 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH( if (filter_index == 4) { // 4 tap. SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 8, 4, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 5) { // 4 tap. 
SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 8, 5, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else { // 2 tap. SetupTaps<2>(&v_horizontal_filter, v_tap); - FilterHorizontal<2, 8, 3, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } } @@ -461,28 +577,792 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( if (filter_index == 2) { // 8 tap. SetupTaps<8>(&v_horizontal_filter, v_tap); - FilterHorizontal<8, 8, 2, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 1) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 8, 1, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 0) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 8, 0, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 4) { // 4 tap. SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 8, 4, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 5) { // 4 tap. 
SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 8, 5, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else { // 2 tap. SetupTaps<2>(&v_horizontal_filter, v_tap); - FilterHorizontal<2, 8, 3, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); + } +} + +void Convolve2D_AVX2(const void* const reference, + const ptrdiff_t reference_stride, + const int horizontal_filter_index, + const int vertical_filter_index, + const int horizontal_filter_id, + const int vertical_filter_id, const int width, + const int height, void* prediction, + const ptrdiff_t pred_stride) { + const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); + const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + + // The output of the horizontal filter is guaranteed to fit in 16 bits. + alignas(32) uint16_t + intermediate_result[kMaxSuperBlockSizeInPixels * + (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)]; + const int intermediate_height = height + vertical_taps - 1; + + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference) - + (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset; + if (width > 2) { + DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, + width, width, intermediate_height, + horizontal_filter_id, horiz_filter_index); + } else { + // Use non avx2 version for smaller widths. + DoHorizontalPass2xH</*is_2d=*/true>( + src, src_stride, intermediate_result, width, width, intermediate_height, + horizontal_filter_id, horiz_filter_index); + } + + // Vertical filter. 
+ auto* dest = static_cast<uint8_t*>(prediction); + const ptrdiff_t dest_stride = pred_stride; + assert(vertical_filter_id != 0); + + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]); + + // Use 256 bits for width > 8. + if (width > 8) { + __m256i taps_256[4]; + const __m128i v_filter_ext = _mm_cvtepi8_epi16(v_filter); + + if (vertical_taps == 8) { + SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<8>(intermediate_result, dest, dest_stride, width, + height, taps_256); + } else if (vertical_taps == 6) { + SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<6>(intermediate_result, dest, dest_stride, width, + height, taps_256); + } else if (vertical_taps == 4) { + SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<4>(intermediate_result, dest, dest_stride, width, + height, taps_256); + } else { // |vertical_taps| == 2 + SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<2>(intermediate_result, dest, dest_stride, width, + height, taps_256); + } + } else { // width <= 8 + __m128i taps[4]; + // Use 128 bit code. 
+ if (vertical_taps == 8) { + SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 2) { + Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height, + taps); + } else if (width == 4) { + Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height, + taps); + } else { + Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, + height, taps); + } + } else if (vertical_taps == 6) { + SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 2) { + Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height, + taps); + } else if (width == 4) { + Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height, + taps); + } else { + Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, + height, taps); + } + } else if (vertical_taps == 4) { + SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 2) { + Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height, + taps); + } else if (width == 4) { + Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height, + taps); + } else { + Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, + height, taps); + } + } else { // |vertical_taps| == 2 + SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 2) { + Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height, + taps); + } else if (width == 4) { + Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height, + taps); + } else { + Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, + height, taps); + } + } + } +} + +// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D +// Vertical calculations. 
+__m256i Compound1DShift(const __m256i sum) { + return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); +} + +template <int filter_index, bool unpack_high = false> +__m256i SumVerticalTaps(const __m256i* const srcs, const __m256i* const v_tap) { + __m256i v_src[4]; + + if (!unpack_high) { + if (filter_index < 2) { + // 6 taps. + v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]); + v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]); + } else if (filter_index == 2) { + // 8 taps. + v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]); + v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]); + v_src[3] = _mm256_unpacklo_epi8(srcs[6], srcs[7]); + } else if (filter_index == 3) { + // 2 taps. + v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]); + } else if (filter_index > 3) { + // 4 taps. + v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]); + } + } else { + if (filter_index < 2) { + // 6 taps. + v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]); + v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]); + v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]); + } else if (filter_index == 2) { + // 8 taps. + v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]); + v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]); + v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]); + v_src[3] = _mm256_unpackhi_epi8(srcs[6], srcs[7]); + } else if (filter_index == 3) { + // 2 taps. + v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]); + } else if (filter_index > 3) { + // 4 taps. 
+ v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]); + v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]); + } + } + return SumOnePassTaps<filter_index>(v_src, v_tap); +} + +template <int filter_index, bool is_compound = false> +void FilterVertical32xH(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int width, const int height, + const __m256i* const v_tap) { + const int num_taps = GetNumTapsInFilter(filter_index); + const int next_row = num_taps - 1; + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + assert(width >= 32); + int x = 0; + do { + const uint8_t* src_x = src + x; + __m256i srcs[8]; + srcs[0] = LoadUnaligned32(src_x); + src_x += src_stride; + if (num_taps >= 4) { + srcs[1] = LoadUnaligned32(src_x); + src_x += src_stride; + srcs[2] = LoadUnaligned32(src_x); + src_x += src_stride; + if (num_taps >= 6) { + srcs[3] = LoadUnaligned32(src_x); + src_x += src_stride; + srcs[4] = LoadUnaligned32(src_x); + src_x += src_stride; + if (num_taps == 8) { + srcs[5] = LoadUnaligned32(src_x); + src_x += src_stride; + srcs[6] = LoadUnaligned32(src_x); + src_x += src_stride; + } + } + } + + auto* dst8_x = dst8 + x; + auto* dst16_x = dst16 + x; + int y = height; + do { + srcs[next_row] = LoadUnaligned32(src_x); + src_x += src_stride; + + const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m256i sums_hi = + SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap); + if (is_compound) { + const __m256i results = + Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20)); + const __m256i results_hi = + Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x31)); + StoreUnaligned32(dst16_x, results); + StoreUnaligned32(dst16_x + 16, results_hi); + dst16_x += dst_stride; + } else { + const __m256i results = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m256i results_hi = + RightShiftWithRounding_S16(sums_hi, kFilterBits - 1); + 
const __m256i packed_results = _mm256_packus_epi16(results, results_hi); + + StoreUnaligned32(dst8_x, packed_results); + dst8_x += dst_stride; + } + + srcs[0] = srcs[1]; + if (num_taps >= 4) { + srcs[1] = srcs[2]; + srcs[2] = srcs[3]; + if (num_taps >= 6) { + srcs[3] = srcs[4]; + srcs[4] = srcs[5]; + if (num_taps == 8) { + srcs[5] = srcs[6]; + srcs[6] = srcs[7]; + } + } + } + } while (--y != 0); + x += 32; + } while (x < width); +} + +template <int filter_index, bool is_compound = false> +void FilterVertical16xH(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int /*width*/, const int height, + const __m256i* const v_tap) { + const int num_taps = GetNumTapsInFilter(filter_index); + const int next_row = num_taps; + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + const uint8_t* src_x = src; + __m256i srcs[8 + 1]; + // The upper 128 bits hold the filter data for the next row. + srcs[0] = _mm256_castsi128_si256(LoadUnaligned16(src_x)); + src_x += src_stride; + if (num_taps >= 4) { + srcs[1] = _mm256_castsi128_si256(LoadUnaligned16(src_x)); + src_x += src_stride; + srcs[0] = + _mm256_inserti128_si256(srcs[0], _mm256_castsi256_si128(srcs[1]), 1); + srcs[2] = _mm256_castsi128_si256(LoadUnaligned16(src_x)); + src_x += src_stride; + srcs[1] = + _mm256_inserti128_si256(srcs[1], _mm256_castsi256_si128(srcs[2]), 1); + if (num_taps >= 6) { + srcs[3] = _mm256_castsi128_si256(LoadUnaligned16(src_x)); + src_x += src_stride; + srcs[2] = + _mm256_inserti128_si256(srcs[2], _mm256_castsi256_si128(srcs[3]), 1); + srcs[4] = _mm256_castsi128_si256(LoadUnaligned16(src_x)); + src_x += src_stride; + srcs[3] = + _mm256_inserti128_si256(srcs[3], _mm256_castsi256_si128(srcs[4]), 1); + if (num_taps == 8) { + srcs[5] = _mm256_castsi128_si256(LoadUnaligned16(src_x)); + src_x += src_stride; + srcs[4] = _mm256_inserti128_si256(srcs[4], + _mm256_castsi256_si128(srcs[5]), 1); + srcs[6] = 
_mm256_castsi128_si256(LoadUnaligned16(src_x)); + src_x += src_stride; + srcs[5] = _mm256_inserti128_si256(srcs[5], + _mm256_castsi256_si128(srcs[6]), 1); + } + } + } + + int y = height; + do { + srcs[next_row - 1] = _mm256_castsi128_si256(LoadUnaligned16(src_x)); + src_x += src_stride; + + srcs[next_row - 2] = _mm256_inserti128_si256( + srcs[next_row - 2], _mm256_castsi256_si128(srcs[next_row - 1]), 1); + + srcs[next_row] = _mm256_castsi128_si256(LoadUnaligned16(src_x)); + src_x += src_stride; + + srcs[next_row - 1] = _mm256_inserti128_si256( + srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1); + + const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m256i sums_hi = + SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap); + if (is_compound) { + const __m256i results = + Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20)); + const __m256i results_hi = + Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x31)); + + StoreUnaligned32(dst16, results); + StoreUnaligned32(dst16 + dst_stride, results_hi); + dst16 += dst_stride << 1; + } else { + const __m256i results = RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m256i results_hi = + RightShiftWithRounding_S16(sums_hi, kFilterBits - 1); + const __m256i packed_results = _mm256_packus_epi16(results, results_hi); + const __m128i this_dst = _mm256_castsi256_si128(packed_results); + const auto next_dst = _mm256_extracti128_si256(packed_results, 1); + + StoreUnaligned16(dst8, this_dst); + StoreUnaligned16(dst8 + dst_stride, next_dst); + dst8 += dst_stride << 1; + } + + srcs[0] = srcs[2]; + if (num_taps >= 4) { + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + if (num_taps >= 6) { + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + if (num_taps == 8) { + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; + } + } + } + y -= 2; + } while (y != 0); +} + +template <int filter_index, bool is_compound = false> +void FilterVertical8xH(const uint8_t* src, const ptrdiff_t 
src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int /*width*/, const int height, + const __m256i* const v_tap) { + const int num_taps = GetNumTapsInFilter(filter_index); + const int next_row = num_taps; + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + const uint8_t* src_x = src; + __m256i srcs[8 + 1]; + // The upper 128 bits hold the filter data for the next row. + srcs[0] = _mm256_castsi128_si256(LoadLo8(src_x)); + src_x += src_stride; + if (num_taps >= 4) { + srcs[1] = _mm256_castsi128_si256(LoadLo8(src_x)); + src_x += src_stride; + srcs[0] = + _mm256_inserti128_si256(srcs[0], _mm256_castsi256_si128(srcs[1]), 1); + srcs[2] = _mm256_castsi128_si256(LoadLo8(src_x)); + src_x += src_stride; + srcs[1] = + _mm256_inserti128_si256(srcs[1], _mm256_castsi256_si128(srcs[2]), 1); + if (num_taps >= 6) { + srcs[3] = _mm256_castsi128_si256(LoadLo8(src_x)); + src_x += src_stride; + srcs[2] = + _mm256_inserti128_si256(srcs[2], _mm256_castsi256_si128(srcs[3]), 1); + srcs[4] = _mm256_castsi128_si256(LoadLo8(src_x)); + src_x += src_stride; + srcs[3] = + _mm256_inserti128_si256(srcs[3], _mm256_castsi256_si128(srcs[4]), 1); + if (num_taps == 8) { + srcs[5] = _mm256_castsi128_si256(LoadLo8(src_x)); + src_x += src_stride; + srcs[4] = _mm256_inserti128_si256(srcs[4], + _mm256_castsi256_si128(srcs[5]), 1); + srcs[6] = _mm256_castsi128_si256(LoadLo8(src_x)); + src_x += src_stride; + srcs[5] = _mm256_inserti128_si256(srcs[5], + _mm256_castsi256_si128(srcs[6]), 1); + } + } + } + + int y = height; + do { + srcs[next_row - 1] = _mm256_castsi128_si256(LoadLo8(src_x)); + src_x += src_stride; + + srcs[next_row - 2] = _mm256_inserti128_si256( + srcs[next_row - 2], _mm256_castsi256_si128(srcs[next_row - 1]), 1); + + srcs[next_row] = _mm256_castsi128_si256(LoadLo8(src_x)); + src_x += src_stride; + + srcs[next_row - 1] = _mm256_inserti128_si256( + srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1); + + const __m256i sums = 
SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m256i results = Compound1DShift(sums); + const __m128i this_dst = _mm256_castsi256_si128(results); + const auto next_dst = _mm256_extracti128_si256(results, 1); + + StoreUnaligned16(dst16, this_dst); + StoreUnaligned16(dst16 + dst_stride, next_dst); + dst16 += dst_stride << 1; + } else { + const __m256i results = RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m256i packed_results = _mm256_packus_epi16(results, results); + const __m128i this_dst = _mm256_castsi256_si128(packed_results); + const auto next_dst = _mm256_extracti128_si256(packed_results, 1); + + StoreLo8(dst8, this_dst); + StoreLo8(dst8 + dst_stride, next_dst); + dst8 += dst_stride << 1; + } + + srcs[0] = srcs[2]; + if (num_taps >= 4) { + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + if (num_taps >= 6) { + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + if (num_taps == 8) { + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; + } + } + } + y -= 2; + } while (y != 0); +} + +template <int filter_index, bool is_compound = false> +void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int /*width*/, const int height, + const __m128i* const v_tap) { + const int num_taps = GetNumTapsInFilter(filter_index); + const int next_row = num_taps - 1; + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + const uint8_t* src_x = src; + __m128i srcs[8]; + srcs[0] = LoadLo8(src_x); + src_x += src_stride; + if (num_taps >= 4) { + srcs[1] = LoadLo8(src_x); + src_x += src_stride; + srcs[2] = LoadLo8(src_x); + src_x += src_stride; + if (num_taps >= 6) { + srcs[3] = LoadLo8(src_x); + src_x += src_stride; + srcs[4] = LoadLo8(src_x); + src_x += src_stride; + if (num_taps == 8) { + srcs[5] = LoadLo8(src_x); + src_x += src_stride; + srcs[6] = LoadLo8(src_x); + src_x += src_stride; + } + } + } + + int y = height; + do { + srcs[next_row] = LoadLo8(src_x); + 
src_x += src_stride; + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += dst_stride; + } else { + const __m128i results = RightShiftWithRounding_S16(sums, kFilterBits - 1); + StoreLo8(dst8, _mm_packus_epi16(results, results)); + dst8 += dst_stride; + } + + srcs[0] = srcs[1]; + if (num_taps >= 4) { + srcs[1] = srcs[2]; + srcs[2] = srcs[3]; + if (num_taps >= 6) { + srcs[3] = srcs[4]; + srcs[4] = srcs[5]; + if (num_taps == 8) { + srcs[5] = srcs[6]; + srcs[6] = srcs[7]; + } + } + } + } while (--y != 0); +} + +void ConvolveVertical_AVX2(const void* const reference, + const ptrdiff_t reference_stride, + const int /*horizontal_filter_index*/, + const int vertical_filter_index, + const int /*horizontal_filter_id*/, + const int vertical_filter_id, const int width, + const int height, void* prediction, + const ptrdiff_t pred_stride) { + const int filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(filter_index); + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference) - + (vertical_taps / 2 - 1) * src_stride; + auto* dest = static_cast<uint8_t*>(prediction); + const ptrdiff_t dest_stride = pred_stride; + assert(vertical_filter_id != 0); + + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]); + + // Use 256 bits for width > 4. + if (width > 4) { + __m256i taps_256[4]; + if (filter_index < 2) { // 6 tap. 
+ SetupTaps<6>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<0>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else if (width == 16) { + FilterVertical16xH<0>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else { + FilterVertical32xH<0>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } + } else if (filter_index == 2) { // 8 tap. + SetupTaps<8>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else if (width == 16) { + FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else { + FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } + } else if (filter_index == 3) { // 2 tap. + SetupTaps<2>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<3>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else if (width == 16) { + FilterVertical16xH<3>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else { + FilterVertical32xH<3>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } + } else if (filter_index == 4) { // 4 tap. 
+ SetupTaps<4>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<4>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else if (width == 16) { + FilterVertical16xH<4>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else { + FilterVertical32xH<4>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } + } else { + SetupTaps<4>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<5>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else if (width == 16) { + FilterVertical16xH<5>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else { + FilterVertical32xH<5>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } + } + } else { // width <= 8 + // Use 128 bit code. + __m128i taps[4]; + + if (filter_index < 2) { // 6 tap. + SetupTaps<6>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, + taps); + } else { + FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, + taps); + } + } else if (filter_index == 2) { // 8 tap. + SetupTaps<8>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, + taps); + } else { + FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, + taps); + } + } else if (filter_index == 3) { // 2 tap. + SetupTaps<2>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, + taps); + } else { + FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, + taps); + } + } else if (filter_index == 4) { // 4 tap. 
+ SetupTaps<4>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, + taps); + } else { + FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, + taps); + } + } else { + SetupTaps<4>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, + taps); + } else { + FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, + taps); + } + } + } +} + +void ConvolveCompoundVertical_AVX2( + const void* const reference, const ptrdiff_t reference_stride, + const int /*horizontal_filter_index*/, const int vertical_filter_index, + const int /*horizontal_filter_id*/, const int vertical_filter_id, + const int width, const int height, void* prediction, + const ptrdiff_t /*pred_stride*/) { + const int filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(filter_index); + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference) - + (vertical_taps / 2 - 1) * src_stride; + auto* dest = static_cast<uint8_t*>(prediction); + const ptrdiff_t dest_stride = width; + assert(vertical_filter_id != 0); + + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]); + + // Use 256 bits for width > 4. + if (width > 4) { + __m256i taps_256[4]; + if (filter_index < 2) { // 6 tap. + SetupTaps<6>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<0, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else if (width == 16) { + FilterVertical16xH<0, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else { + FilterVertical32xH<0, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } + } else if (filter_index == 2) { // 8 tap. 
+ SetupTaps<8>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<2, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else if (width == 16) { + FilterVertical16xH<2, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else { + FilterVertical32xH<2, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } + } else if (filter_index == 3) { // 2 tap. + SetupTaps<2>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<3, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else if (width == 16) { + FilterVertical16xH<3, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else { + FilterVertical32xH<3, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } + } else if (filter_index == 4) { // 4 tap. + SetupTaps<4>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<4, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else if (width == 16) { + FilterVertical16xH<4, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else { + FilterVertical32xH<4, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } + } else { + SetupTaps<4>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<5, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else if (width == 16) { + FilterVertical16xH<5, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else { + FilterVertical32xH<5, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } + } + } else { // width <= 4 + // Use 128 bit code. + __m128i taps[4]; + + if (filter_index < 2) { // 6 tap. 
+ SetupTaps<6>(&v_filter, taps); + FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } else if (filter_index == 2) { // 8 tap. + SetupTaps<8>(&v_filter, taps); + FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } else if (filter_index == 3) { // 2 tap. + SetupTaps<2>(&v_filter, taps); + FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } else if (filter_index == 4) { // 4 tap. + SetupTaps<4>(&v_filter, taps); + FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } else { + SetupTaps<4>(&v_filter, taps); + FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } } } @@ -509,10 +1389,140 @@ void ConvolveHorizontal_AVX2(const void* const reference, } } +void ConvolveCompoundHorizontal_AVX2( + const void* const reference, const ptrdiff_t reference_stride, + const int horizontal_filter_index, const int /*vertical_filter_index*/, + const int horizontal_filter_id, const int /*vertical_filter_id*/, + const int width, const int height, void* prediction, + const ptrdiff_t pred_stride) { + const int filter_index = GetFilterIndex(horizontal_filter_index, width); + // Set |src| to the outermost tap. + const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset; + auto* dest = static_cast<uint8_t*>(prediction); + // All compound functions output to the predictor buffer with |pred_stride| + // equal to |width|. + assert(pred_stride == width); + // Compound functions start at 4x4. + assert(width >= 4 && height >= 4); + +#ifdef NDEBUG + // Quiet compiler error. 
+ (void)pred_stride; +#endif + + DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>( + src, reference_stride, dest, width, width, height, horizontal_filter_id, + filter_index); +} + +void ConvolveCompound2D_AVX2(const void* const reference, + const ptrdiff_t reference_stride, + const int horizontal_filter_index, + const int vertical_filter_index, + const int horizontal_filter_id, + const int vertical_filter_id, const int width, + const int height, void* prediction, + const ptrdiff_t pred_stride) { + const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); + const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + + // The output of the horizontal filter is guaranteed to fit in 16 bits. + alignas(32) uint16_t + intermediate_result[kMaxSuperBlockSizeInPixels * + (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)]; + const int intermediate_height = height + vertical_taps - 1; + + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference) - + (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset; + DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>( + src, src_stride, intermediate_result, width, width, intermediate_height, + horizontal_filter_id, horiz_filter_index); + + // Vertical filter. + auto* dest = static_cast<uint8_t*>(prediction); + const ptrdiff_t dest_stride = pred_stride; + assert(vertical_filter_id != 0); + + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]); + + // Use 256 bits for width > 8. 
+ if (width > 8) { + __m256i taps_256[4]; + const __m128i v_filter_ext = _mm_cvtepi8_epi16(v_filter); + + if (vertical_taps == 8) { + SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<8, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps_256); + } else if (vertical_taps == 6) { + SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<6, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps_256); + } else if (vertical_taps == 4) { + SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<4, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps_256); + } else { // |vertical_taps| == 2 + SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<2, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps_256); + } + } else { // width <= 8 + __m128i taps[4]; + // Use 128 bit code. 
+ if (vertical_taps == 8) { + SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<8, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } else if (vertical_taps == 6) { + SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<6, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } else if (vertical_taps == 4) { + SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<4, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } else { // |vertical_taps| == 2 + SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<2, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } + } +} + void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); dsp->convolve[0][0][0][1] = ConvolveHorizontal_AVX2; + dsp->convolve[0][0][1][0] = ConvolveVertical_AVX2; + dsp->convolve[0][0][1][1] = Convolve2D_AVX2; + + dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_AVX2; + dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_AVX2; + dsp->convolve[0][1][1][1] = ConvolveCompound2D_AVX2; } } // namespace @@ -523,7 +1533,7 @@ void ConvolveInit_AVX2() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_AVX2 +#else // 
!LIBGAV1_TARGETING_AVX2 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/convolve_avx2.h b/src/dsp/x86/convolve_avx2.h index 6179d98..e509bc9 100644 --- a/src/dsp/x86/convolve_avx2.h +++ b/src/dsp/x86/convolve_avx2.h @@ -38,6 +38,22 @@ void ConvolveInit_AVX2(); #define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_AVX2 #endif +#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal +#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_AVX2 +#endif + +#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical +#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_AVX2 +#endif + +#ifndef LIBGAV1_Dsp8bpp_Convolve2D +#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_AVX2 +#endif + +#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical +#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_AVX2 +#endif + #endif // LIBGAV1_TARGETING_AVX2 #endif // LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc index 3a0fff5..9b72fe4 100644 --- a/src/dsp/x86/convolve_sse4.cc +++ b/src/dsp/x86/convolve_sse4.cc @@ -34,41 +34,7 @@ namespace dsp { namespace low_bitdepth { namespace { -#include "src/dsp/convolve.inc" - -// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and -// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final -// sum from outranging int16_t. -template <int filter_index> -__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) { - __m128i sum; - if (filter_index < 2) { - // 6 taps. - const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1 - const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3 - const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5 - sum = _mm_add_epi16(v_madd_21, v_madd_43); - sum = _mm_add_epi16(sum, v_madd_65); - } else if (filter_index == 2) { - // 8 taps. 
- const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0 - const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2 - const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]); // k5k4 - const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]); // k7k6 - const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32); - const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76); - sum = _mm_add_epi16(v_sum_7654, v_sum_3210); - } else if (filter_index == 3) { - // 2 taps. - sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3 - } else { - // 4 taps. - const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]); // k3k2 - const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]); // k5k4 - sum = _mm_add_epi16(v_madd_32, v_madd_54); - } - return sum; -} +#include "src/dsp/x86/convolve_sse4.inc" template <int filter_index> __m128i SumHorizontalTaps(const uint8_t* const src, @@ -125,68 +91,7 @@ __m128i HorizontalTaps8To16(const uint8_t* const src, return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); } -template <int filter_index> -__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, - const __m128i* const v_tap) { - const __m128i input0 = LoadLo8(&src[2]); - const __m128i input1 = LoadLo8(&src[2 + src_stride]); - - if (filter_index == 3) { - // 03 04 04 05 05 06 06 07 .... - const __m128i input0_dup = - _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 3); - // 13 14 14 15 15 16 16 17 .... - const __m128i input1_dup = - _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 3); - const __m128i v_src_43 = _mm_unpacklo_epi64(input0_dup, input1_dup); - const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3 - return v_sum_43; - } - - // 02 03 03 04 04 05 05 06 06 07 .... - const __m128i input0_dup = - _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 1); - // 12 13 13 14 14 15 15 16 16 17 .... 
- const __m128i input1_dup = - _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 1); - // 04 05 05 06 06 07 07 08 ... - const __m128i input0_dup_54 = _mm_srli_si128(input0_dup, 4); - // 14 15 15 16 16 17 17 18 ... - const __m128i input1_dup_54 = _mm_srli_si128(input1_dup, 4); - const __m128i v_src_32 = _mm_unpacklo_epi64(input0_dup, input1_dup); - const __m128i v_src_54 = _mm_unpacklo_epi64(input0_dup_54, input1_dup_54); - const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2 - const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4 - const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32); - return v_sum_5432; -} - -template <int filter_index> -__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, - const __m128i* const v_tap) { - __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); - - // Normally the Horizontal pass does the downshift in two passes: - // kInterRoundBitsHorizontal - 1 and then (kFilterBits - - // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them - // requires adding the rounding offset from the skipped shift. 
- constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); - - sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit)); - sum = RightShiftWithRounding_S16(sum, kFilterBits - 1); - return _mm_packus_epi16(sum, sum); -} - -template <int filter_index> -__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride, - const __m128i* const v_tap) { - const __m128i sum = - SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); - - return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); -} - -template <int num_taps, int step, int filter_index, bool is_2d = false, +template <int num_taps, int filter_index, bool is_2d = false, bool is_compound = false> void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, void* const dest, const ptrdiff_t pred_stride, @@ -197,7 +102,7 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, // 4 tap filters are never used when width > 4. if (num_taps != 4 && width > 4) { - int y = 0; + int y = height; do { int x = 0; do { @@ -214,12 +119,12 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, SimpleHorizontalTaps<filter_index>(&src[x], v_tap); StoreLo8(&dest8[x], result); } - x += step; + x += 8; } while (x < width); src += src_stride; dest8 += pred_stride; dest16 += pred_stride; - } while (++y < height); + } while (--y != 0); return; } @@ -229,7 +134,7 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, assert(num_taps <= 4); if (num_taps <= 4) { if (width == 4) { - int y = 0; + int y = height; do { if (is_2d || is_compound) { const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap); @@ -241,12 +146,13 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, src += src_stride; dest8 += pred_stride; dest16 += pred_stride; - } while (++y < height); + } while (--y != 0); return; } if (!is_compound) { - int y = 0; + int y = height; + if (is_2d) y -= 1; do { if (is_2d) { const 
__m128i sum = @@ -265,8 +171,8 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, } src += src_stride << 1; - y += 2; - } while (y < height - 1); + y -= 2; + } while (y != 0); // The 2d filters have an odd |height| because the horizontal pass // generates context for the vertical pass. @@ -298,303 +204,6 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, } } -template <int num_taps, bool is_2d_vertical = false> -LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter, - __m128i* v_tap) { - if (num_taps == 8) { - v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0 - v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 - v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 - v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6 - if (is_2d_vertical) { - v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); - v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); - v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); - v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]); - } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); - v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); - v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]); - } - } else if (num_taps == 6) { - const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); - v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1 - v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 - v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5 - if (is_2d_vertical) { - v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); - v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); - v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); - } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); - v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); - } - } else if (num_taps == 4) { - v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 - v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 - if (is_2d_vertical) { - 
v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); - v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); - } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); - } - } else { // num_taps == 2 - const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); - v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 - if (is_2d_vertical) { - v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); - } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - } - } -} - -template <int num_taps, bool is_compound> -__m128i SimpleSum2DVerticalTaps(const __m128i* const src, - const __m128i* const taps) { - __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]); - __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]); - if (num_taps >= 4) { - __m128i madd_lo = - _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]); - __m128i madd_hi = - _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]); - sum_lo = _mm_add_epi32(sum_lo, madd_lo); - sum_hi = _mm_add_epi32(sum_hi, madd_hi); - if (num_taps >= 6) { - madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]); - madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]); - sum_lo = _mm_add_epi32(sum_lo, madd_lo); - sum_hi = _mm_add_epi32(sum_hi, madd_hi); - if (num_taps == 8) { - madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]); - madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]); - sum_lo = _mm_add_epi32(sum_lo, madd_lo); - sum_hi = _mm_add_epi32(sum_hi, madd_hi); - } - } - } - - if (is_compound) { - return _mm_packs_epi32( - RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), - RightShiftWithRounding_S32(sum_hi, - kInterRoundBitsCompoundVertical - 1)); - } - - return _mm_packs_epi32( - RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), - RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1)); -} - -template <int num_taps, bool 
is_compound = false> -void Filter2DVertical(const uint16_t* src, void* const dst, - const ptrdiff_t dst_stride, const int width, - const int height, const __m128i* const taps) { - assert(width >= 8); - constexpr int next_row = num_taps - 1; - // The Horizontal pass uses |width| as |stride| for the intermediate buffer. - const ptrdiff_t src_stride = width; - - auto* dst8 = static_cast<uint8_t*>(dst); - auto* dst16 = static_cast<uint16_t*>(dst); - - int x = 0; - do { - __m128i srcs[8]; - const uint16_t* src_x = src + x; - srcs[0] = LoadAligned16(src_x); - src_x += src_stride; - if (num_taps >= 4) { - srcs[1] = LoadAligned16(src_x); - src_x += src_stride; - srcs[2] = LoadAligned16(src_x); - src_x += src_stride; - if (num_taps >= 6) { - srcs[3] = LoadAligned16(src_x); - src_x += src_stride; - srcs[4] = LoadAligned16(src_x); - src_x += src_stride; - if (num_taps == 8) { - srcs[5] = LoadAligned16(src_x); - src_x += src_stride; - srcs[6] = LoadAligned16(src_x); - src_x += src_stride; - } - } - } - - int y = 0; - do { - srcs[next_row] = LoadAligned16(src_x); - src_x += src_stride; - - const __m128i sum = - SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); - if (is_compound) { - StoreUnaligned16(dst16 + x + y * dst_stride, sum); - } else { - StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(sum, sum)); - } - - srcs[0] = srcs[1]; - if (num_taps >= 4) { - srcs[1] = srcs[2]; - srcs[2] = srcs[3]; - if (num_taps >= 6) { - srcs[3] = srcs[4]; - srcs[4] = srcs[5]; - if (num_taps == 8) { - srcs[5] = srcs[6]; - srcs[6] = srcs[7]; - } - } - } - } while (++y < height); - x += 8; - } while (x < width); -} - -// Take advantage of |src_stride| == |width| to process two rows at a time. 
-template <int num_taps, bool is_compound = false> -void Filter2DVertical4xH(const uint16_t* src, void* const dst, - const ptrdiff_t dst_stride, const int height, - const __m128i* const taps) { - auto* dst8 = static_cast<uint8_t*>(dst); - auto* dst16 = static_cast<uint16_t*>(dst); - - __m128i srcs[9]; - srcs[0] = LoadAligned16(src); - src += 8; - if (num_taps >= 4) { - srcs[2] = LoadAligned16(src); - src += 8; - srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]); - if (num_taps >= 6) { - srcs[4] = LoadAligned16(src); - src += 8; - srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]); - if (num_taps == 8) { - srcs[6] = LoadAligned16(src); - src += 8; - srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]); - } - } - } - - int y = 0; - do { - srcs[num_taps] = LoadAligned16(src); - src += 8; - srcs[num_taps - 1] = _mm_unpacklo_epi64( - _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]); - - const __m128i sum = - SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); - if (is_compound) { - StoreUnaligned16(dst16, sum); - dst16 += 4 << 1; - } else { - const __m128i results = _mm_packus_epi16(sum, sum); - Store4(dst8, results); - dst8 += dst_stride; - Store4(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - } - - srcs[0] = srcs[2]; - if (num_taps >= 4) { - srcs[1] = srcs[3]; - srcs[2] = srcs[4]; - if (num_taps >= 6) { - srcs[3] = srcs[5]; - srcs[4] = srcs[6]; - if (num_taps == 8) { - srcs[5] = srcs[7]; - srcs[6] = srcs[8]; - } - } - } - y += 2; - } while (y < height); -} - -// Take advantage of |src_stride| == |width| to process four rows at a time. -template <int num_taps> -void Filter2DVertical2xH(const uint16_t* src, void* const dst, - const ptrdiff_t dst_stride, const int height, - const __m128i* const taps) { - constexpr int next_row = (num_taps < 6) ? 
4 : 8; - - auto* dst8 = static_cast<uint8_t*>(dst); - - __m128i srcs[9]; - srcs[0] = LoadAligned16(src); - src += 8; - if (num_taps >= 6) { - srcs[4] = LoadAligned16(src); - src += 8; - srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); - if (num_taps == 8) { - srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); - srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); - } - } - - int y = 0; - do { - srcs[next_row] = LoadAligned16(src); - src += 8; - if (num_taps == 2) { - srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); - } else if (num_taps == 4) { - srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); - srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); - srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); - } else if (num_taps == 6) { - srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); - srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); - srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4); - } else if (num_taps == 8) { - srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4); - srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8); - srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12); - } - - const __m128i sum = - SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps); - const __m128i results = _mm_packus_epi16(sum, sum); - - Store2(dst8, results); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 2)); - // When |height| <= 4 the taps are restricted to 2 and 4 tap variants. - // Therefore we don't need to check this condition when |height| > 4. 
- if (num_taps <= 4 && height == 2) return; - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 6)); - dst8 += dst_stride; - - srcs[0] = srcs[4]; - if (num_taps == 6) { - srcs[1] = srcs[5]; - srcs[4] = srcs[8]; - } else if (num_taps == 8) { - srcs[1] = srcs[5]; - srcs[2] = srcs[6]; - srcs[3] = srcs[7]; - srcs[4] = srcs[8]; - } - - y += 4; - } while (y < height); -} - template <bool is_2d = false, bool is_compound = false> LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( const uint8_t* const src, const ptrdiff_t src_stride, void* const dst, @@ -607,28 +216,28 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( if (filter_index == 2) { // 8 tap. SetupTaps<8>(&v_horizontal_filter, v_tap); - FilterHorizontal<8, 8, 2, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 1) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 8, 1, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 0) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 8, 0, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 4) { // 4 tap. SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 8, 4, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 5) { // 4 tap. 
SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 8, 5, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else { // 2 tap. SetupTaps<2>(&v_horizontal_filter, v_tap); - FilterHorizontal<2, 8, 3, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } } @@ -718,39 +327,6 @@ void Convolve2D_SSE4_1(const void* const reference, } } -// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D -// Vertical calculations. -__m128i Compound1DShift(const __m128i sum) { - return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); -} - -template <int filter_index> -__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) { - __m128i v_src[4]; - - if (filter_index < 2) { - // 6 taps. - v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); - v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); - v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); - } else if (filter_index == 2) { - // 8 taps. - v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); - v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); - v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); - v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]); - } else if (filter_index == 3) { - // 2 taps. - v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); - } else if (filter_index > 3) { - // 4 taps. 
- v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); - v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); - } - const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap); - return sum; -} - template <int filter_index, bool is_compound = false> void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, void* const dst, const ptrdiff_t dst_stride, @@ -787,7 +363,9 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, } } - int y = 0; + auto* dst8_x = dst8 + x; + auto* dst16_x = dst16 + x; + int y = height; do { srcs[next_row] = LoadLo8(src_x); src_x += src_stride; @@ -795,11 +373,13 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); - StoreUnaligned16(dst16 + x + y * dst_stride, results); + StoreUnaligned16(dst16_x, results); + dst16_x += dst_stride; } else { const __m128i results = RightShiftWithRounding_S16(sums, kFilterBits - 1); - StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(results, results)); + StoreLo8(dst8_x, _mm_packus_epi16(results, results)); + dst8_x += dst_stride; } srcs[0] = srcs[1]; @@ -815,506 +395,11 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, } } } - } while (++y < height); + } while (--y != 0); x += 8; } while (x < width); } -template <int filter_index, bool is_compound = false> -void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, - void* const dst, const ptrdiff_t dst_stride, - const int height, const __m128i* const v_tap) { - const int num_taps = GetNumTapsInFilter(filter_index); - auto* dst8 = static_cast<uint8_t*>(dst); - auto* dst16 = static_cast<uint16_t*>(dst); - - __m128i srcs[9]; - - if (num_taps == 2) { - srcs[2] = _mm_setzero_si128(); - // 00 01 02 03 - srcs[0] = Load4(src); - src += src_stride; - - int y = 0; - do { - // 10 11 12 13 - const __m128i a = Load4(src); - // 00 01 02 03 10 11 12 13 - srcs[0] = 
_mm_unpacklo_epi32(srcs[0], a); - src += src_stride; - // 20 21 22 23 - srcs[2] = Load4(src); - src += src_stride; - // 10 11 12 13 20 21 22 23 - srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); - - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - if (is_compound) { - const __m128i results = Compound1DShift(sums); - StoreUnaligned16(dst16, results); - dst16 += 4 << 1; - } else { - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - Store4(dst8, results); - dst8 += dst_stride; - Store4(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - } - - srcs[0] = srcs[2]; - y += 2; - } while (y < height); - } else if (num_taps == 4) { - srcs[4] = _mm_setzero_si128(); - // 00 01 02 03 - srcs[0] = Load4(src); - src += src_stride; - // 10 11 12 13 - const __m128i a = Load4(src); - // 00 01 02 03 10 11 12 13 - srcs[0] = _mm_unpacklo_epi32(srcs[0], a); - src += src_stride; - // 20 21 22 23 - srcs[2] = Load4(src); - src += src_stride; - // 10 11 12 13 20 21 22 23 - srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); - - int y = 0; - do { - // 30 31 32 33 - const __m128i b = Load4(src); - // 20 21 22 23 30 31 32 33 - srcs[2] = _mm_unpacklo_epi32(srcs[2], b); - src += src_stride; - // 40 41 42 43 - srcs[4] = Load4(src); - src += src_stride; - // 30 31 32 33 40 41 42 43 - srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); - - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - if (is_compound) { - const __m128i results = Compound1DShift(sums); - StoreUnaligned16(dst16, results); - dst16 += 4 << 1; - } else { - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - Store4(dst8, results); - dst8 += dst_stride; - Store4(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - } - - srcs[0] = srcs[2]; - srcs[1] = srcs[3]; - srcs[2] = srcs[4]; - y += 2; - } while (y < height); - } else 
if (num_taps == 6) { - srcs[6] = _mm_setzero_si128(); - // 00 01 02 03 - srcs[0] = Load4(src); - src += src_stride; - // 10 11 12 13 - const __m128i a = Load4(src); - // 00 01 02 03 10 11 12 13 - srcs[0] = _mm_unpacklo_epi32(srcs[0], a); - src += src_stride; - // 20 21 22 23 - srcs[2] = Load4(src); - src += src_stride; - // 10 11 12 13 20 21 22 23 - srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); - // 30 31 32 33 - const __m128i b = Load4(src); - // 20 21 22 23 30 31 32 33 - srcs[2] = _mm_unpacklo_epi32(srcs[2], b); - src += src_stride; - // 40 41 42 43 - srcs[4] = Load4(src); - src += src_stride; - // 30 31 32 33 40 41 42 43 - srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); - - int y = 0; - do { - // 50 51 52 53 - const __m128i c = Load4(src); - // 40 41 42 43 50 51 52 53 - srcs[4] = _mm_unpacklo_epi32(srcs[4], c); - src += src_stride; - // 60 61 62 63 - srcs[6] = Load4(src); - src += src_stride; - // 50 51 52 53 60 61 62 63 - srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); - - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - if (is_compound) { - const __m128i results = Compound1DShift(sums); - StoreUnaligned16(dst16, results); - dst16 += 4 << 1; - } else { - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - Store4(dst8, results); - dst8 += dst_stride; - Store4(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - } - - srcs[0] = srcs[2]; - srcs[1] = srcs[3]; - srcs[2] = srcs[4]; - srcs[3] = srcs[5]; - srcs[4] = srcs[6]; - y += 2; - } while (y < height); - } else if (num_taps == 8) { - srcs[8] = _mm_setzero_si128(); - // 00 01 02 03 - srcs[0] = Load4(src); - src += src_stride; - // 10 11 12 13 - const __m128i a = Load4(src); - // 00 01 02 03 10 11 12 13 - srcs[0] = _mm_unpacklo_epi32(srcs[0], a); - src += src_stride; - // 20 21 22 23 - srcs[2] = Load4(src); - src += src_stride; - // 10 11 12 13 20 21 22 23 - srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); - // 30 31 
32 33 - const __m128i b = Load4(src); - // 20 21 22 23 30 31 32 33 - srcs[2] = _mm_unpacklo_epi32(srcs[2], b); - src += src_stride; - // 40 41 42 43 - srcs[4] = Load4(src); - src += src_stride; - // 30 31 32 33 40 41 42 43 - srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); - // 50 51 52 53 - const __m128i c = Load4(src); - // 40 41 42 43 50 51 52 53 - srcs[4] = _mm_unpacklo_epi32(srcs[4], c); - src += src_stride; - // 60 61 62 63 - srcs[6] = Load4(src); - src += src_stride; - // 50 51 52 53 60 61 62 63 - srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); - - int y = 0; - do { - // 70 71 72 73 - const __m128i d = Load4(src); - // 60 61 62 63 70 71 72 73 - srcs[6] = _mm_unpacklo_epi32(srcs[6], d); - src += src_stride; - // 80 81 82 83 - srcs[8] = Load4(src); - src += src_stride; - // 70 71 72 73 80 81 82 83 - srcs[7] = _mm_unpacklo_epi32(d, srcs[8]); - - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - if (is_compound) { - const __m128i results = Compound1DShift(sums); - StoreUnaligned16(dst16, results); - dst16 += 4 << 1; - } else { - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - Store4(dst8, results); - dst8 += dst_stride; - Store4(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - } - - srcs[0] = srcs[2]; - srcs[1] = srcs[3]; - srcs[2] = srcs[4]; - srcs[3] = srcs[5]; - srcs[4] = srcs[6]; - srcs[5] = srcs[7]; - srcs[6] = srcs[8]; - y += 2; - } while (y < height); - } -} - -template <int filter_index, bool negative_outside_taps = false> -void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, - void* const dst, const ptrdiff_t dst_stride, - const int height, const __m128i* const v_tap) { - const int num_taps = GetNumTapsInFilter(filter_index); - auto* dst8 = static_cast<uint8_t*>(dst); - - __m128i srcs[9]; - - if (num_taps == 2) { - srcs[2] = _mm_setzero_si128(); - // 00 01 - srcs[0] = Load2(src); - src += src_stride; - - int y = 0; - do { 
- // 00 01 10 11 - srcs[0] = Load2<1>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 - srcs[0] = Load2<2>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 30 31 - srcs[0] = Load2<3>(src, srcs[0]); - src += src_stride; - // 40 41 - srcs[2] = Load2<0>(src, srcs[2]); - src += src_stride; - // 00 01 10 11 20 21 30 31 40 41 - const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]); - // 10 11 20 21 30 31 40 41 - srcs[1] = _mm_srli_si128(srcs_0_2, 2); - // This uses srcs[0]..srcs[1]. - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - - Store2(dst8, results); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 2)); - if (height == 2) return; - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 6)); - dst8 += dst_stride; - - srcs[0] = srcs[2]; - y += 4; - } while (y < height); - } else if (num_taps == 4) { - srcs[4] = _mm_setzero_si128(); - - // 00 01 - srcs[0] = Load2(src); - src += src_stride; - // 00 01 10 11 - srcs[0] = Load2<1>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 - srcs[0] = Load2<2>(src, srcs[0]); - src += src_stride; - - int y = 0; - do { - // 00 01 10 11 20 21 30 31 - srcs[0] = Load2<3>(src, srcs[0]); - src += src_stride; - // 40 41 - srcs[4] = Load2<0>(src, srcs[4]); - src += src_stride; - // 40 41 50 51 - srcs[4] = Load2<1>(src, srcs[4]); - src += src_stride; - // 40 41 50 51 60 61 - srcs[4] = Load2<2>(src, srcs[4]); - src += src_stride; - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 - const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); - // 10 11 20 21 30 31 40 41 - srcs[1] = _mm_srli_si128(srcs_0_4, 2); - // 20 21 30 31 40 41 50 51 - srcs[2] = _mm_srli_si128(srcs_0_4, 4); - // 30 31 40 41 50 51 60 61 - srcs[3] = _mm_srli_si128(srcs_0_4, 6); - - // This 
uses srcs[0]..srcs[3]. - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - - Store2(dst8, results); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 2)); - if (height == 2) return; - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 6)); - dst8 += dst_stride; - - srcs[0] = srcs[4]; - y += 4; - } while (y < height); - } else if (num_taps == 6) { - // During the vertical pass the number of taps is restricted when - // |height| <= 4. - assert(height > 4); - srcs[8] = _mm_setzero_si128(); - - // 00 01 - srcs[0] = Load2(src); - src += src_stride; - // 00 01 10 11 - srcs[0] = Load2<1>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 - srcs[0] = Load2<2>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 30 31 - srcs[0] = Load2<3>(src, srcs[0]); - src += src_stride; - // 40 41 - srcs[4] = Load2(src); - src += src_stride; - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 - const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]); - // 10 11 20 21 30 31 40 41 - srcs[1] = _mm_srli_si128(srcs_0_4x, 2); - - int y = 0; - do { - // 40 41 50 51 - srcs[4] = Load2<1>(src, srcs[4]); - src += src_stride; - // 40 41 50 51 60 61 - srcs[4] = Load2<2>(src, srcs[4]); - src += src_stride; - // 40 41 50 51 60 61 70 71 - srcs[4] = Load2<3>(src, srcs[4]); - src += src_stride; - // 80 81 - srcs[8] = Load2<0>(src, srcs[8]); - src += src_stride; - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 - const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); - // 20 21 30 31 40 41 50 51 - srcs[2] = _mm_srli_si128(srcs_0_4, 4); - // 30 31 40 41 50 51 60 61 - srcs[3] = _mm_srli_si128(srcs_0_4, 6); - const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]); - // 50 51 60 61 70 71 80 81 - srcs[5] = _mm_srli_si128(srcs_4_8, 2); 
- - // This uses srcs[0]..srcs[5]. - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - - Store2(dst8, results); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 2)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 6)); - dst8 += dst_stride; - - srcs[0] = srcs[4]; - srcs[1] = srcs[5]; - srcs[4] = srcs[8]; - y += 4; - } while (y < height); - } else if (num_taps == 8) { - // During the vertical pass the number of taps is restricted when - // |height| <= 4. - assert(height > 4); - srcs[8] = _mm_setzero_si128(); - // 00 01 - srcs[0] = Load2(src); - src += src_stride; - // 00 01 10 11 - srcs[0] = Load2<1>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 - srcs[0] = Load2<2>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 30 31 - srcs[0] = Load2<3>(src, srcs[0]); - src += src_stride; - // 40 41 - srcs[4] = Load2(src); - src += src_stride; - // 40 41 50 51 - srcs[4] = Load2<1>(src, srcs[4]); - src += src_stride; - // 40 41 50 51 60 61 - srcs[4] = Load2<2>(src, srcs[4]); - src += src_stride; - - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 - const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); - // 10 11 20 21 30 31 40 41 - srcs[1] = _mm_srli_si128(srcs_0_4, 2); - // 20 21 30 31 40 41 50 51 - srcs[2] = _mm_srli_si128(srcs_0_4, 4); - // 30 31 40 41 50 51 60 61 - srcs[3] = _mm_srli_si128(srcs_0_4, 6); - - int y = 0; - do { - // 40 41 50 51 60 61 70 71 - srcs[4] = Load2<3>(src, srcs[4]); - src += src_stride; - // 80 81 - srcs[8] = Load2<0>(src, srcs[8]); - src += src_stride; - // 80 81 90 91 - srcs[8] = Load2<1>(src, srcs[8]); - src += src_stride; - // 80 81 90 91 a0 a1 - srcs[8] = Load2<2>(src, srcs[8]); - src += src_stride; - - // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1 - const __m128i 
srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]); - // 50 51 60 61 70 71 80 81 - srcs[5] = _mm_srli_si128(srcs_4_8, 2); - // 60 61 70 71 80 81 90 91 - srcs[6] = _mm_srli_si128(srcs_4_8, 4); - // 70 71 80 81 90 91 a0 a1 - srcs[7] = _mm_srli_si128(srcs_4_8, 6); - - // This uses srcs[0]..srcs[7]. - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - - Store2(dst8, results); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 2)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 6)); - dst8 += dst_stride; - - srcs[0] = srcs[4]; - srcs[1] = srcs[5]; - srcs[2] = srcs[6]; - srcs[3] = srcs[7]; - srcs[4] = srcs[8]; - y += 4; - } while (y < height); - } -} - void ConvolveVertical_SSE4_1(const void* const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, @@ -1339,9 +424,9 @@ void ConvolveVertical_SSE4_1(const void* const reference, if (filter_index < 2) { // 6 tap. SetupTaps<6>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<0>(src, src_stride, dest, dest_stride, width, height, taps); @@ -1349,9 +434,9 @@ void ConvolveVertical_SSE4_1(const void* const reference, } else if (filter_index == 2) { // 8 tap. 
SetupTaps<8>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, taps); @@ -1359,9 +444,9 @@ void ConvolveVertical_SSE4_1(const void* const reference, } else if (filter_index == 3) { // 2 tap. SetupTaps<2>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<3>(src, src_stride, dest, dest_stride, width, height, taps); @@ -1369,9 +454,9 @@ void ConvolveVertical_SSE4_1(const void* const reference, } else if (filter_index == 4) { // 4 tap. 
SetupTaps<4>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<4>(src, src_stride, dest, dest_stride, width, height, taps); @@ -1382,9 +467,9 @@ void ConvolveVertical_SSE4_1(const void* const reference, SetupTaps<4>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<5>(src, src_stride, dest, dest_stride, width, height, taps); @@ -1474,8 +559,8 @@ void ConvolveCompoundVertical_SSE4_1( if (filter_index < 2) { // 6 tap. 
SetupTaps<6>(&v_filter, taps); if (width == 4) { - FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); @@ -1484,8 +569,8 @@ void ConvolveCompoundVertical_SSE4_1( SetupTaps<8>(&v_filter, taps); if (width == 4) { - FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); @@ -1494,8 +579,8 @@ void ConvolveCompoundVertical_SSE4_1( SetupTaps<2>(&v_filter, taps); if (width == 4) { - FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); @@ -1504,8 +589,8 @@ void ConvolveCompoundVertical_SSE4_1( SetupTaps<4>(&v_filter, taps); if (width == 4) { - FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); @@ -1514,8 +599,8 @@ void ConvolveCompoundVertical_SSE4_1( SetupTaps<4>(&v_filter, taps); if (width == 4) { - FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); @@ -1752,7 +837,11 @@ inline void GetHalfSubPixelFilter(__m128i* output) { template <int num_taps, int grade_x> inline 
void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices, __m128i* const source /*[num_taps >> 1]*/) { - const __m128i src_vals = LoadUnaligned16(src); + // |used_bytes| is only computed in msan builds. Mask away unused bytes for + // msan because it incorrectly models the outcome of the shuffles in some + // cases. This has not been reproduced out of context. + const int used_bytes = _mm_extract_epi8(src_indices, 15) + 1 + num_taps - 2; + const __m128i src_vals = LoadUnaligned16Msan(src, 16 - used_bytes); source[0] = _mm_shuffle_epi8(src_vals, src_indices); if (grade_x == 1) { if (num_taps > 2) { @@ -1768,7 +857,7 @@ inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices, assert(grade_x > 1); assert(num_taps != 4); // grade_x > 1 also means width >= 8 && num_taps != 4 - const __m128i src_vals_ext = LoadLo8(src + 16); + const __m128i src_vals_ext = LoadLo8Msan(src + 16, 24 - used_bytes); if (num_taps > 2) { source[1] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 2), src_indices); @@ -1983,14 +1072,10 @@ __m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo, // |width_class| is 2, 4, or 8, according to the Store function that should be // used. 
template <int num_taps, int width_class, bool is_compound> -#if LIBGAV1_MSAN -__attribute__((no_sanitize_memory)) void ConvolveVerticalScale( -#else -inline void ConvolveVerticalScale( -#endif - const int16_t* src, const int width, const int subpixel_y, - const int filter_index, const int step_y, const int height, void* dest, - const ptrdiff_t dest_stride) { +inline void ConvolveVerticalScale(const int16_t* src, const int width, + const int subpixel_y, const int filter_index, + const int step_y, const int height, + void* dest, const ptrdiff_t dest_stride) { constexpr ptrdiff_t src_stride = kIntermediateStride; constexpr int kernel_offset = (8 - num_taps) / 2; const int16_t* src_y = src; @@ -2819,7 +1904,7 @@ void ConvolveInit_SSE4_1() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/convolve_sse4.inc b/src/dsp/x86/convolve_sse4.inc new file mode 100644 index 0000000..550d6a4 --- /dev/null +++ b/src/dsp/x86/convolve_sse4.inc @@ -0,0 +1,934 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Common 128 bit functions used for sse4/avx2 convolve implementations. +// This will be included inside an anonymous namespace on files where these are +// necessary. 
+ +#include "src/dsp/convolve.inc" + +// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and +// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final +// sum from outranging int16_t. +template <int filter_index> +__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) { + __m128i sum; + if (filter_index < 2) { + // 6 taps. + const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1 + const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3 + const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5 + sum = _mm_add_epi16(v_madd_21, v_madd_43); + sum = _mm_add_epi16(sum, v_madd_65); + } else if (filter_index == 2) { + // 8 taps. + const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0 + const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2 + const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]); // k5k4 + const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]); // k7k6 + const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32); + const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76); + sum = _mm_add_epi16(v_sum_7654, v_sum_3210); + } else if (filter_index == 3) { + // 2 taps. + sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3 + } else { + // 4 taps. 
+ const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]); // k3k2 + const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]); // k5k4 + sum = _mm_add_epi16(v_madd_32, v_madd_54); + } + return sum; +} + +template <int filter_index> +__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, + const __m128i* const v_tap) { + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]); + + if (filter_index == 3) { + // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17 + const __m128i v_src_43 = _mm_shuffle_epi8( + v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403)); + const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3 + return v_sum_43; + } + + // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16 + const __m128i v_src_32 = _mm_shuffle_epi8( + v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302)); + // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx + const __m128i v_src_54 = _mm_shuffle_epi8( + v_src, _mm_set_epi32(static_cast<int>(0x800f0f0e), 0x0e0d0d0c, + static_cast<int>(0x80070706), 0x06050504)); + const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2 + const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4 + const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32); + return v_sum_5432; +} + +template <int filter_index> +__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, + const __m128i* const v_tap) { + __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); + + // Normally the Horizontal pass does the downshift in two passes: + // kInterRoundBitsHorizontal - 1 and then (kFilterBits - + // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them + // requires adding the rounding offset from the skipped shift. 
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); + + sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit)); + sum = RightShiftWithRounding_S16(sum, kFilterBits - 1); + return _mm_packus_epi16(sum, sum); +} + +template <int filter_index> +__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride, + const __m128i* const v_tap) { + const __m128i sum = + SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); + + return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); +} + +template <int num_taps, bool is_2d_vertical = false> +LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter, + __m128i* v_tap) { + if (num_taps == 8) { + v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0 + v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 + v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 + v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6 + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); + v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); + v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]); + } + } else if (num_taps == 6) { + const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); + v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1 + v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 + v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5 + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); + } + } else if 
(num_taps == 4) { + v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 + v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + } + } else { // num_taps == 2 + const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); + v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + } + } +} + +template <int num_taps, bool is_compound> +__m128i SimpleSum2DVerticalTaps(const __m128i* const src, + const __m128i* const taps) { + __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]); + __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]); + if (num_taps >= 4) { + __m128i madd_lo = + _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]); + __m128i madd_hi = + _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]); + sum_lo = _mm_add_epi32(sum_lo, madd_lo); + sum_hi = _mm_add_epi32(sum_hi, madd_hi); + if (num_taps >= 6) { + madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]); + madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]); + sum_lo = _mm_add_epi32(sum_lo, madd_lo); + sum_hi = _mm_add_epi32(sum_hi, madd_hi); + if (num_taps == 8) { + madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]); + madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]); + sum_lo = _mm_add_epi32(sum_lo, madd_lo); + sum_hi = _mm_add_epi32(sum_hi, madd_hi); + } + } + } + + if (is_compound) { + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), + RightShiftWithRounding_S32(sum_hi, + kInterRoundBitsCompoundVertical - 1)); + } + + return _mm_packs_epi32( + 
RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), + RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1)); +} + +template <int num_taps, bool is_compound = false> +void Filter2DVertical(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int width, + const int height, const __m128i* const taps) { + assert(width >= 8); + constexpr int next_row = num_taps - 1; + // The Horizontal pass uses |width| as |stride| for the intermediate buffer. + const ptrdiff_t src_stride = width; + + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + int x = 0; + do { + __m128i srcs[8]; + const uint16_t* src_x = src + x; + srcs[0] = LoadAligned16(src_x); + src_x += src_stride; + if (num_taps >= 4) { + srcs[1] = LoadAligned16(src_x); + src_x += src_stride; + srcs[2] = LoadAligned16(src_x); + src_x += src_stride; + if (num_taps >= 6) { + srcs[3] = LoadAligned16(src_x); + src_x += src_stride; + srcs[4] = LoadAligned16(src_x); + src_x += src_stride; + if (num_taps == 8) { + srcs[5] = LoadAligned16(src_x); + src_x += src_stride; + srcs[6] = LoadAligned16(src_x); + src_x += src_stride; + } + } + } + + auto* dst8_x = dst8 + x; + auto* dst16_x = dst16 + x; + int y = height; + do { + srcs[next_row] = LoadAligned16(src_x); + src_x += src_stride; + + const __m128i sum = + SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); + if (is_compound) { + StoreUnaligned16(dst16_x, sum); + dst16_x += dst_stride; + } else { + StoreLo8(dst8_x, _mm_packus_epi16(sum, sum)); + dst8_x += dst_stride; + } + + srcs[0] = srcs[1]; + if (num_taps >= 4) { + srcs[1] = srcs[2]; + srcs[2] = srcs[3]; + if (num_taps >= 6) { + srcs[3] = srcs[4]; + srcs[4] = srcs[5]; + if (num_taps == 8) { + srcs[5] = srcs[6]; + srcs[6] = srcs[7]; + } + } + } + } while (--y != 0); + x += 8; + } while (x < width); +} + +// Take advantage of |src_stride| == |width| to process two rows at a time. 
+template <int num_taps, bool is_compound = false> +void Filter2DVertical4xH(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int height, + const __m128i* const taps) { + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + __m128i srcs[9]; + srcs[0] = LoadAligned16(src); + src += 8; + if (num_taps >= 4) { + srcs[2] = LoadAligned16(src); + src += 8; + srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]); + if (num_taps >= 6) { + srcs[4] = LoadAligned16(src); + src += 8; + srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]); + if (num_taps == 8) { + srcs[6] = LoadAligned16(src); + src += 8; + srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]); + } + } + } + + int y = height; + do { + srcs[num_taps] = LoadAligned16(src); + src += 8; + srcs[num_taps - 1] = _mm_unpacklo_epi64( + _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]); + + const __m128i sum = + SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); + if (is_compound) { + StoreUnaligned16(dst16, sum); + dst16 += 4 << 1; + } else { + const __m128i results = _mm_packus_epi16(sum, sum); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + if (num_taps >= 4) { + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + if (num_taps >= 6) { + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + if (num_taps == 8) { + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; + } + } + } + y -= 2; + } while (y != 0); +} + +// Take advantage of |src_stride| == |width| to process four rows at a time. +template <int num_taps> +void Filter2DVertical2xH(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int height, + const __m128i* const taps) { + constexpr int next_row = (num_taps < 6) ? 
4 : 8; + + auto* dst8 = static_cast<uint8_t*>(dst); + + __m128i srcs[9]; + srcs[0] = LoadAligned16(src); + src += 8; + if (num_taps >= 6) { + srcs[4] = LoadAligned16(src); + src += 8; + srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); + if (num_taps == 8) { + srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); + srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); + } + } + + int y = height; + do { + srcs[next_row] = LoadAligned16(src); + src += 8; + if (num_taps == 2) { + srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); + } else if (num_taps == 4) { + srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); + srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); + srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); + } else if (num_taps == 6) { + srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); + srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); + srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4); + } else if (num_taps == 8) { + srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4); + srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8); + srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12); + } + + const __m128i sum = + SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps); + const __m128i results = _mm_packus_epi16(sum, sum); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + // When |height| <= 4 the taps are restricted to 2 and 4 tap variants. + // Therefore we don't need to check this condition when |height| > 4. + if (num_taps <= 4 && height == 2) return; + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + if (num_taps == 6) { + srcs[1] = srcs[5]; + srcs[4] = srcs[8]; + } else if (num_taps == 8) { + srcs[1] = srcs[5]; + srcs[2] = srcs[6]; + srcs[3] = srcs[7]; + srcs[4] = srcs[8]; + } + + y -= 4; + } while (y != 0); +} + +// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D +// Vertical calculations. 
+__m128i Compound1DShift(const __m128i sum) { + return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); +} + +template <int filter_index> +__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) { + __m128i v_src[4]; + + if (filter_index < 2) { + // 6 taps. + v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); + v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); + } else if (filter_index == 2) { + // 8 taps. + v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); + v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); + v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]); + } else if (filter_index == 3) { + // 2 taps. + v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + } else if (filter_index > 3) { + // 4 taps. + v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); + } + const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap); + return sum; +} + +// TODO(slavarnway): Use num_taps instead of filter_index for templates. See the +// 2D version. 
+template <int num_taps, int filter_index, bool is_compound = false> +void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int height, const __m128i* const v_tap) { + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + __m128i srcs[9]; + + if (num_taps == 2) { + srcs[2] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + + int y = height; + do { + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + y -= 2; + } while (y != 0); + } else if (num_taps == 4) { + srcs[4] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + + int y = height; + do { + // 30 31 32 33 + const __m128i b = Load4(src); + // 20 21 22 23 30 31 32 33 + srcs[2] = _mm_unpacklo_epi32(srcs[2], b); + src += src_stride; + // 40 41 42 43 + srcs[4] = Load4(src); + src += src_stride; + // 30 31 32 33 40 41 42 43 + srcs[3] = _mm_unpacklo_epi32(b, 
srcs[4]); + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + y -= 2; + } while (y != 0); + } else if (num_taps == 6) { + srcs[6] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + // 30 31 32 33 + const __m128i b = Load4(src); + // 20 21 22 23 30 31 32 33 + srcs[2] = _mm_unpacklo_epi32(srcs[2], b); + src += src_stride; + // 40 41 42 43 + srcs[4] = Load4(src); + src += src_stride; + // 30 31 32 33 40 41 42 43 + srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); + + int y = height; + do { + // 50 51 52 53 + const __m128i c = Load4(src); + // 40 41 42 43 50 51 52 53 + srcs[4] = _mm_unpacklo_epi32(srcs[4], c); + src += src_stride; + // 60 61 62 63 + srcs[6] = Load4(src); + src += src_stride; + // 50 51 52 53 60 61 62 63 + srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, 
_mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + y -= 2; + } while (y != 0); + } else if (num_taps == 8) { + srcs[8] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + // 30 31 32 33 + const __m128i b = Load4(src); + // 20 21 22 23 30 31 32 33 + srcs[2] = _mm_unpacklo_epi32(srcs[2], b); + src += src_stride; + // 40 41 42 43 + srcs[4] = Load4(src); + src += src_stride; + // 30 31 32 33 40 41 42 43 + srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); + // 50 51 52 53 + const __m128i c = Load4(src); + // 40 41 42 43 50 51 52 53 + srcs[4] = _mm_unpacklo_epi32(srcs[4], c); + src += src_stride; + // 60 61 62 63 + srcs[6] = Load4(src); + src += src_stride; + // 50 51 52 53 60 61 62 63 + srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); + + int y = height; + do { + // 70 71 72 73 + const __m128i d = Load4(src); + // 60 61 62 63 70 71 72 73 + srcs[6] = _mm_unpacklo_epi32(srcs[6], d); + src += src_stride; + // 80 81 82 83 + srcs[8] = Load4(src); + src += src_stride; + // 70 71 72 73 80 81 82 83 + srcs[7] = _mm_unpacklo_epi32(d, srcs[8]); + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + 
srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; + y -= 2; + } while (y != 0); + } +} + +template <int num_taps, int filter_index, bool negative_outside_taps = false> +void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int height, const __m128i* const v_tap) { + auto* dst8 = static_cast<uint8_t*>(dst); + + __m128i srcs[9]; + + if (num_taps == 2) { + srcs[2] = _mm_setzero_si128(); + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + + int y = height; + do { + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[2] = Load2<0>(src, srcs[2]); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 + const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_2, 2); + // This uses srcs[0]..srcs[1]. 
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + if (height == 2) return; + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[2]; + y -= 4; + } while (y != 0); + } else if (num_taps == 4) { + srcs[4] = _mm_setzero_si128(); + + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + + int y = height; + do { + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[4] = Load2<0>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 + srcs[4] = Load2<2>(src, srcs[4]); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_4, 2); + // 20 21 30 31 40 41 50 51 + srcs[2] = _mm_srli_si128(srcs_0_4, 4); + // 30 31 40 41 50 51 60 61 + srcs[3] = _mm_srli_si128(srcs_0_4, 6); + + // This uses srcs[0]..srcs[3]. 
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + if (height == 2) return; + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + y -= 4; + } while (y != 0); + } else if (num_taps == 6) { + // During the vertical pass the number of taps is restricted when + // |height| <= 4. + assert(height > 4); + srcs[8] = _mm_setzero_si128(); + + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[4] = Load2(src); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_4x, 2); + + int y = height; + do { + // 40 41 50 51 + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 + srcs[4] = Load2<2>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 70 71 + srcs[4] = Load2<3>(src, srcs[4]); + src += src_stride; + // 80 81 + srcs[8] = Load2<0>(src, srcs[8]); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 20 21 30 31 40 41 50 51 + srcs[2] = _mm_srli_si128(srcs_0_4, 4); + // 30 31 40 41 50 51 60 61 + srcs[3] = _mm_srli_si128(srcs_0_4, 6); + const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]); + // 50 51 60 61 70 71 80 81 + srcs[5] = _mm_srli_si128(srcs_4_8, 2); + + // This uses 
srcs[0]..srcs[5]. + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + srcs[1] = srcs[5]; + srcs[4] = srcs[8]; + y -= 4; + } while (y != 0); + } else if (num_taps == 8) { + // During the vertical pass the number of taps is restricted when + // |height| <= 4. + assert(height > 4); + srcs[8] = _mm_setzero_si128(); + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[4] = Load2(src); + src += src_stride; + // 40 41 50 51 + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 + srcs[4] = Load2<2>(src, srcs[4]); + src += src_stride; + + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_4, 2); + // 20 21 30 31 40 41 50 51 + srcs[2] = _mm_srli_si128(srcs_0_4, 4); + // 30 31 40 41 50 51 60 61 + srcs[3] = _mm_srli_si128(srcs_0_4, 6); + + int y = height; + do { + // 40 41 50 51 60 61 70 71 + srcs[4] = Load2<3>(src, srcs[4]); + src += src_stride; + // 80 81 + srcs[8] = Load2<0>(src, srcs[8]); + src += src_stride; + // 80 81 90 91 + srcs[8] = Load2<1>(src, srcs[8]); + src += src_stride; + // 80 81 90 91 a0 a1 + srcs[8] = Load2<2>(src, srcs[8]); + src += src_stride; + + // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1 + const __m128i srcs_4_8 = 
_mm_unpacklo_epi64(srcs[4], srcs[8]); + // 50 51 60 61 70 71 80 81 + srcs[5] = _mm_srli_si128(srcs_4_8, 2); + // 60 61 70 71 80 81 90 91 + srcs[6] = _mm_srli_si128(srcs_4_8, 4); + // 70 71 80 81 90 91 a0 a1 + srcs[7] = _mm_srli_si128(srcs_4_8, 6); + + // This uses srcs[0]..srcs[7]. + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + srcs[1] = srcs[5]; + srcs[2] = srcs[6]; + srcs[3] = srcs[7]; + srcs[4] = srcs[8]; + y -= 4; + } while (y != 0); + } +} diff --git a/src/dsp/x86/distance_weighted_blend_sse4.cc b/src/dsp/x86/distance_weighted_blend_sse4.cc index deb57ef..3c29b19 100644 --- a/src/dsp/x86/distance_weighted_blend_sse4.cc +++ b/src/dsp/x86/distance_weighted_blend_sse4.cc @@ -30,6 +30,7 @@ namespace libgav1 { namespace dsp { +namespace low_bitdepth { namespace { constexpr int kInterPostRoundBit = 4; @@ -212,13 +213,231 @@ void Init8bpp() { } } // namespace +} // namespace low_bitdepth -void DistanceWeightedBlendInit_SSE4_1() { Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +constexpr int kMax10bppSample = (1 << 10) - 1; +constexpr int kInterPostRoundBit = 4; + +inline __m128i ComputeWeightedAverage8(const __m128i& pred0, + const __m128i& pred1, + const __m128i& weight0, + const __m128i& weight1) { + // This offset is a combination of round_factor and round_offset + // which are to be added and subtracted respectively. + // Here kInterPostRoundBit + 4 is considering bitdepth=10. 
+ constexpr int offset = + (1 << ((kInterPostRoundBit + 4) - 1)) - (kCompoundOffset << 4); + const __m128i zero = _mm_setzero_si128(); + const __m128i bias = _mm_set1_epi32(offset); + const __m128i clip_high = _mm_set1_epi16(kMax10bppSample); + + __m128i prediction0 = _mm_cvtepu16_epi32(pred0); + __m128i mult0 = _mm_mullo_epi32(prediction0, weight0); + __m128i prediction1 = _mm_cvtepu16_epi32(pred1); + __m128i mult1 = _mm_mullo_epi32(prediction1, weight1); + __m128i sum = _mm_add_epi32(mult0, mult1); + sum = _mm_add_epi32(sum, bias); + const __m128i result0 = _mm_srai_epi32(sum, kInterPostRoundBit + 4); + + prediction0 = _mm_unpackhi_epi16(pred0, zero); + mult0 = _mm_mullo_epi32(prediction0, weight0); + prediction1 = _mm_unpackhi_epi16(pred1, zero); + mult1 = _mm_mullo_epi32(prediction1, weight1); + sum = _mm_add_epi32(mult0, mult1); + sum = _mm_add_epi32(sum, bias); + const __m128i result1 = _mm_srai_epi32(sum, kInterPostRoundBit + 4); + const __m128i pack = _mm_packus_epi32(result0, result1); + + return _mm_min_epi16(pack, clip_high); +} + +template <int height> +inline void DistanceWeightedBlend4xH_SSE4_1( + const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0, + const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) { + auto* dst = static_cast<uint16_t*>(dest); + const __m128i weight0 = _mm_set1_epi32(weight_0); + const __m128i weight1 = _mm_set1_epi32(weight_1); + + int y = height; + do { + const __m128i src_00 = LoadLo8(pred_0); + const __m128i src_10 = LoadLo8(pred_1); + pred_0 += 4; + pred_1 += 4; + __m128i src_0 = LoadHi8(src_00, pred_0); + __m128i src_1 = LoadHi8(src_10, pred_1); + pred_0 += 4; + pred_1 += 4; + const __m128i res0 = + ComputeWeightedAverage8(src_0, src_1, weight0, weight1); + + const __m128i src_01 = LoadLo8(pred_0); + const __m128i src_11 = LoadLo8(pred_1); + pred_0 += 4; + pred_1 += 4; + src_0 = LoadHi8(src_01, pred_0); + src_1 = LoadHi8(src_11, pred_1); + pred_0 += 4; + pred_1 += 4; + const __m128i res1 
= + ComputeWeightedAverage8(src_0, src_1, weight0, weight1); + + StoreLo8(dst, res0); + dst += dest_stride; + StoreHi8(dst, res0); + dst += dest_stride; + StoreLo8(dst, res1); + dst += dest_stride; + StoreHi8(dst, res1); + dst += dest_stride; + y -= 4; + } while (y != 0); +} + +template <int height> +inline void DistanceWeightedBlend8xH_SSE4_1( + const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0, + const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) { + auto* dst = static_cast<uint16_t*>(dest); + const __m128i weight0 = _mm_set1_epi32(weight_0); + const __m128i weight1 = _mm_set1_epi32(weight_1); + + int y = height; + do { + const __m128i src_00 = LoadAligned16(pred_0); + const __m128i src_10 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; + const __m128i res0 = + ComputeWeightedAverage8(src_00, src_10, weight0, weight1); + + const __m128i src_01 = LoadAligned16(pred_0); + const __m128i src_11 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; + const __m128i res1 = + ComputeWeightedAverage8(src_01, src_11, weight0, weight1); + + StoreUnaligned16(dst, res0); + dst += dest_stride; + StoreUnaligned16(dst, res1); + dst += dest_stride; + y -= 2; + } while (y != 0); +} + +inline void DistanceWeightedBlendLarge_SSE4_1( + const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0, + const uint8_t weight_1, const int width, const int height, void* const dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast<uint16_t*>(dest); + const __m128i weight0 = _mm_set1_epi32(weight_0); + const __m128i weight1 = _mm_set1_epi32(weight_1); + + int y = height; + do { + int x = 0; + do { + const __m128i src_0_lo = LoadAligned16(pred_0 + x); + const __m128i src_1_lo = LoadAligned16(pred_1 + x); + const __m128i res_lo = + ComputeWeightedAverage8(src_0_lo, src_1_lo, weight0, weight1); + + const __m128i src_0_hi = LoadAligned16(pred_0 + x + 8); + const __m128i src_1_hi = LoadAligned16(pred_1 + x + 8); + const __m128i 
res_hi = + ComputeWeightedAverage8(src_0_hi, src_1_hi, weight0, weight1); + + StoreUnaligned16(dst + x, res_lo); + x += 8; + StoreUnaligned16(dst + x, res_hi); + x += 8; + } while (x < width); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); +} + +void DistanceWeightedBlend_SSE4_1(const void* prediction_0, + const void* prediction_1, + const uint8_t weight_0, + const uint8_t weight_1, const int width, + const int height, void* const dest, + const ptrdiff_t dest_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + const ptrdiff_t dst_stride = dest_stride / sizeof(*pred_0); + if (width == 4) { + if (height == 4) { + DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1, + dest, dst_stride); + } else if (height == 8) { + DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1, + dest, dst_stride); + } else { + assert(height == 16); + DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1, + dest, dst_stride); + } + return; + } + + if (width == 8) { + switch (height) { + case 4: + DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1, + dest, dst_stride); + return; + case 8: + DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1, + dest, dst_stride); + return; + case 16: + DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1, + dest, dst_stride); + return; + default: + assert(height == 32); + DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1, + dest, dst_stride); + + return; + } + } + + DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width, + height, dest, dst_stride); +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); +#if DSP_ENABLED_10BPP_SSE4_1(DistanceWeightedBlend) + dsp->distance_weighted_blend = DistanceWeightedBlend_SSE4_1; +#endif 
+} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void DistanceWeightedBlendInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/distance_weighted_blend_sse4.h b/src/dsp/x86/distance_weighted_blend_sse4.h index 8646eca..dbb9f88 100644 --- a/src/dsp/x86/distance_weighted_blend_sse4.h +++ b/src/dsp/x86/distance_weighted_blend_sse4.h @@ -36,6 +36,10 @@ void DistanceWeightedBlendInit_SSE4_1(); #define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1 #endif +#ifndef LIBGAV1_Dsp10bpp_DistanceWeightedBlend +#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1 +#endif + #endif // LIBGAV1_TARGETING_SSE4_1 #endif // LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_ diff --git a/src/dsp/x86/film_grain_sse4.cc b/src/dsp/x86/film_grain_sse4.cc new file mode 100644 index 0000000..745c1ca --- /dev/null +++ b/src/dsp/x86/film_grain_sse4.cc @@ -0,0 +1,514 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/film_grain.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 +#include <smmintrin.h> + +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "src/dsp/common.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/film_grain_common.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/logging.h" + +namespace libgav1 { +namespace dsp { +namespace film_grain { +namespace { + +// Load 8 values from source, widening to int16_t intermediate value size. +// The function is overloaded for each type and bitdepth for simplicity. +inline __m128i LoadSource(const int8_t* src) { + return _mm_cvtepi8_epi16(LoadLo8(src)); +} + +// Load 8 values from source, widening to int16_t intermediate value size. +inline __m128i LoadSource(const uint8_t* src) { + return _mm_cvtepu8_epi16(LoadLo8(src)); +} + +inline __m128i LoadSourceMsan(const uint8_t* src, const int valid_range) { + return _mm_cvtepu8_epi16(LoadLo8Msan(src, 8 - valid_range)); +} + +// Store 8 values to dest, narrowing to uint8_t from int16_t intermediate value. +inline void StoreUnsigned(uint8_t* dest, const __m128i data) { + StoreLo8(dest, _mm_packus_epi16(data, data)); +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +// Load 8 values from source. +inline __m128i LoadSource(const int16_t* src) { return LoadUnaligned16(src); } + +// Load 8 values from source. +inline __m128i LoadSource(const uint16_t* src) { return LoadUnaligned16(src); } + +// Store 8 values to dest. +inline void StoreUnsigned(uint16_t* dest, const __m128i data) { + StoreUnaligned16(dest, data); +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed. 
+inline __m128i GetAverageLuma(const uint8_t* const luma, int subsampling_x) { + if (subsampling_x != 0) { + const __m128i src = LoadUnaligned16(luma); + + return RightShiftWithRounding_U16( + _mm_hadd_epi16(_mm_cvtepu8_epi16(src), + _mm_unpackhi_epi8(src, _mm_setzero_si128())), + 1); + } + return _mm_cvtepu8_epi16(LoadLo8(luma)); +} + +inline __m128i GetAverageLumaMsan(const uint8_t* const luma, int subsampling_x, + int valid_range) { + if (subsampling_x != 0) { + const __m128i src = LoadUnaligned16Msan(luma, 16 - valid_range); + + return RightShiftWithRounding_U16( + _mm_hadd_epi16(_mm_cvtepu8_epi16(src), + _mm_unpackhi_epi8(src, _mm_setzero_si128())), + 1); + } + return _mm_cvtepu8_epi16(LoadLo8Msan(luma, 8 - valid_range)); +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed. +inline __m128i GetAverageLuma(const uint16_t* const luma, int subsampling_x) { + if (subsampling_x != 0) { + return RightShiftWithRounding_U16( + _mm_hadd_epi16(LoadUnaligned16(luma), LoadUnaligned16(luma + 8)), 1); + } + return LoadUnaligned16(luma); +} + +inline __m128i GetAverageLumaMsan(const uint16_t* const luma, int subsampling_x, + int valid_range) { + if (subsampling_x != 0) { + return RightShiftWithRounding_U16( + _mm_hadd_epi16( + LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma)), + LoadUnaligned16Msan(luma + 8, 32 - valid_range * sizeof(*luma))), + 1); + } + return LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma)); +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +inline __m128i Clip3(const __m128i value, const __m128i low, + const __m128i high) { + const __m128i clipped_to_ceiling = _mm_min_epi16(high, value); + return _mm_max_epi16(low, clipped_to_ceiling); +} + +template <int bitdepth, typename Pixel> +inline __m128i GetScalingFactors( + const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* source) { + alignas(16) int16_t start_vals[8]; + if (bitdepth == 8) { + // TODO(petersonab): Speed this 
up by creating a uint16_t scaling_lut. + // Currently this code results in a series of movzbl. + for (int i = 0; i < 8; ++i) { + start_vals[i] = scaling_lut[source[i]]; + } + return LoadAligned16(start_vals); + } + alignas(16) int16_t end_vals[8]; + // TODO(petersonab): Precompute this into a larger table for direct lookups. + for (int i = 0; i < 8; ++i) { + const int index = source[i] >> 2; + start_vals[i] = scaling_lut[index]; + end_vals[i] = scaling_lut[index + 1]; + } + const __m128i start = LoadAligned16(start_vals); + const __m128i end = LoadAligned16(end_vals); + __m128i remainder = LoadSource(source); + remainder = _mm_srli_epi16(_mm_slli_epi16(remainder, 14), 1); + const __m128i delta = _mm_mulhrs_epi16(_mm_sub_epi16(end, start), remainder); + return _mm_add_epi16(start, delta); +} + +// |scaling_shift| is in range [8,11]. +template <int bitdepth> +inline __m128i ScaleNoise(const __m128i noise, const __m128i scaling, + const __m128i scaling_shift) { + const __m128i shifted_scale_factors = _mm_sll_epi16(scaling, scaling_shift); + return _mm_mulhrs_epi16(noise, shifted_scale_factors); +} + +template <int bitdepth, typename GrainType, typename Pixel> +void BlendNoiseWithImageLuma_SSE4_1( + const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift, + int width, int height, int start_height, + const uint8_t scaling_lut_y[kScalingLookupTableSize], + const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y, + ptrdiff_t dest_stride_y) { + const auto* noise_image = + static_cast<const Array2D<GrainType>*>(noise_image_ptr); + const auto* in_y_row = static_cast<const Pixel*>(source_plane_y); + source_stride_y /= sizeof(Pixel); + auto* out_y_row = static_cast<Pixel*>(dest_plane_y); + dest_stride_y /= sizeof(Pixel); + const __m128i floor = _mm_set1_epi16(min_value); + const __m128i ceiling = _mm_set1_epi16(max_luma); + const int safe_width = width & ~7; + const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift); + 
int y = 0; + do { + int x = 0; + for (; x < safe_width; x += 8) { + // TODO(b/133525232): Make 16-pixel version of loop body. + const __m128i orig = LoadSource(&in_y_row[x]); + const __m128i scaling = + GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]); + __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x])); + + noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift); + const __m128i combined = _mm_add_epi16(orig, noise); + StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling)); + } + + if (x < width) { + Pixel luma_buffer[8]; + // Prevent arbitrary indices from entering GetScalingFactors. + memset(luma_buffer, 0, sizeof(luma_buffer)); + const int valid_range = width - x; + memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0])); + luma_buffer[valid_range] = in_y_row[width - 1]; + const __m128i orig = LoadSource(&in_y_row[x]); + const __m128i scaling = + GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, luma_buffer); + __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x])); + + noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift); + const __m128i combined = _mm_add_epi16(orig, noise); + StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling)); + } + in_y_row += source_stride_y; + out_y_row += dest_stride_y; + } while (++y < height); + out_y_row = static_cast<Pixel*>(dest_plane_y); +} + +template <int bitdepth, typename GrainType, typename Pixel> +inline __m128i BlendChromaValsWithCfl( + const Pixel* average_luma_buffer, + const uint8_t scaling_lut[kScalingLookupTableSize], + const Pixel* chroma_cursor, const GrainType* noise_image_cursor, + const __m128i scaling_shift) { + const __m128i scaling = + GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer); + const __m128i orig = LoadSource(chroma_cursor); + __m128i noise = LoadSource(noise_image_cursor); + noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift); + return _mm_add_epi16(orig, noise); 
+} + +template <int bitdepth, typename GrainType, typename Pixel> +LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1( + const Array2D<GrainType>& noise_image, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, int scaling_shift, + const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* in_y_row, + ptrdiff_t source_stride_y, const Pixel* in_chroma_row, + ptrdiff_t source_stride_chroma, Pixel* out_chroma_row, + ptrdiff_t dest_stride) { + const __m128i floor = _mm_set1_epi16(min_value); + const __m128i ceiling = _mm_set1_epi16(max_chroma); + alignas(16) Pixel luma_buffer[16]; + + const int chroma_height = (height + subsampling_y) >> subsampling_y; + const int chroma_width = (width + subsampling_x) >> subsampling_x; + // |chroma_width| is rounded up. If |width| is odd, then the final pixel will + // need to be guarded from overread, even if |chroma_width| is divisible by 8. + const int safe_chroma_width = (chroma_width - (width & 1)) & ~7; + + // Writing to this buffer avoids the cost of doing 8 lane lookups in a row + // in GetScalingFactors. + Pixel average_luma_buffer[8]; + assert(start_height % 2 == 0); + start_height >>= subsampling_y; + const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift); + int y = 0; + do { + int x = 0; + for (; x < safe_chroma_width; x += 8) { + const int luma_x = x << subsampling_x; + // TODO(petersonab): Consider specializing by subsampling_x. In the 444 + // case &in_y_row[x] can be passed to GetScalingFactors directly. 
+ const __m128i average_luma = + GetAverageLuma(&in_y_row[luma_x], subsampling_x); + StoreUnsigned(average_luma_buffer, average_luma); + + const __m128i blended = + BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>( + average_luma_buffer, scaling_lut, &in_chroma_row[x], + &(noise_image[y + start_height][x]), derived_scaling_shift); + StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling)); + } + + // This section only runs if width % (8 << sub_x) != 0. It should never run + // on 720p and above. + if (x < chroma_width) { + // Prevent huge indices from entering GetScalingFactors due to + // uninitialized values. This is not a problem in 8bpp because the table + // is made larger than 255 values. + if (bitdepth > 8) { + memset(luma_buffer, 0, sizeof(luma_buffer)); + } + const int luma_x = x << subsampling_x; + const int valid_range = width - luma_x; + assert(valid_range < 16); + memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0])); + luma_buffer[valid_range] = in_y_row[width - 1]; + const __m128i average_luma = + GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1); + StoreUnsigned(average_luma_buffer, average_luma); + + const __m128i blended = + BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>( + average_luma_buffer, scaling_lut, &in_chroma_row[x], + &(noise_image[y + start_height][x]), derived_scaling_shift); + StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling)); + } + + in_y_row += source_stride_y << subsampling_y; + in_chroma_row += source_stride_chroma; + out_chroma_row += dest_stride; + } while (++y < chroma_height); +} + +// This function is for the case params_.chroma_scaling_from_luma == true. +// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y. 
+template <int bitdepth, typename GrainType, typename Pixel> +void BlendNoiseWithImageChromaWithCfl_SSE4_1( + Plane plane, const FilmGrainParams& params, const void* noise_image_ptr, + int min_value, int max_chroma, int width, int height, int start_height, + int subsampling_x, int subsampling_y, + const uint8_t scaling_lut[kScalingLookupTableSize], + const void* source_plane_y, ptrdiff_t source_stride_y, + const void* source_plane_uv, ptrdiff_t source_stride_uv, + void* dest_plane_uv, ptrdiff_t dest_stride_uv) { + const auto* noise_image = + static_cast<const Array2D<GrainType>*>(noise_image_ptr); + const auto* in_y = static_cast<const Pixel*>(source_plane_y); + source_stride_y /= sizeof(Pixel); + + const auto* in_uv = static_cast<const Pixel*>(source_plane_uv); + source_stride_uv /= sizeof(Pixel); + auto* out_uv = static_cast<Pixel*>(dest_plane_uv); + dest_stride_uv /= sizeof(Pixel); + BlendChromaPlaneWithCfl_SSE4_1<bitdepth, GrainType, Pixel>( + noise_image[plane], min_value, max_chroma, width, height, start_height, + subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y, + source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv); +} + +} // namespace + +namespace low_bitdepth { +namespace { + +// |offset| is 32x4 packed to add with the result of _mm_madd_epi16. 
+inline __m128i BlendChromaValsNoCfl8bpp( + const uint8_t scaling_lut[kScalingLookupTableSize], const __m128i& orig, + const int8_t* noise_image_cursor, const __m128i& average_luma, + const __m128i& scaling_shift, const __m128i& offset, + const __m128i& weights) { + uint8_t merged_buffer[8]; + const __m128i combined_lo = + _mm_madd_epi16(_mm_unpacklo_epi16(average_luma, orig), weights); + const __m128i combined_hi = + _mm_madd_epi16(_mm_unpackhi_epi16(average_luma, orig), weights); + const __m128i merged_base = _mm_packs_epi32(_mm_srai_epi32((combined_lo), 6), + _mm_srai_epi32((combined_hi), 6)); + + const __m128i merged = _mm_add_epi16(merged_base, offset); + + StoreLo8(merged_buffer, _mm_packus_epi16(merged, merged)); + const __m128i scaling = + GetScalingFactors<8, uint8_t>(scaling_lut, merged_buffer); + __m128i noise = LoadSource(noise_image_cursor); + noise = ScaleNoise<8>(noise, scaling, scaling_shift); + return _mm_add_epi16(orig, noise); +} + +LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1( + const Array2D<int8_t>& noise_image, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, int scaling_shift, int chroma_offset, + int chroma_multiplier, int luma_multiplier, + const uint8_t scaling_lut[kScalingLookupTableSize], const uint8_t* in_y_row, + ptrdiff_t source_stride_y, const uint8_t* in_chroma_row, + ptrdiff_t source_stride_chroma, uint8_t* out_chroma_row, + ptrdiff_t dest_stride) { + const __m128i floor = _mm_set1_epi16(min_value); + const __m128i ceiling = _mm_set1_epi16(max_chroma); + + const int chroma_height = (height + subsampling_y) >> subsampling_y; + const int chroma_width = (width + subsampling_x) >> subsampling_x; + // |chroma_width| is rounded up. If |width| is odd, then the final luma pixel + // will need to be guarded from overread, even if |chroma_width| is a + // multiple of 8. 
+ const int safe_chroma_width = (chroma_width - (width & 1)) & ~7; + alignas(16) uint8_t luma_buffer[16]; + const __m128i offset = _mm_set1_epi16(chroma_offset); + const __m128i multipliers = _mm_set1_epi32(LeftShift(chroma_multiplier, 16) | + (luma_multiplier & 0xFFFF)); + const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift); + + start_height >>= subsampling_y; + int y = 0; + do { + int x = 0; + for (; x < safe_chroma_width; x += 8) { + const int luma_x = x << subsampling_x; + const __m128i average_luma = + GetAverageLuma(&in_y_row[luma_x], subsampling_x); + const __m128i orig_chroma = LoadSource(&in_chroma_row[x]); + const __m128i blended = BlendChromaValsNoCfl8bpp( + scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), + average_luma, derived_scaling_shift, offset, multipliers); + StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling)); + } + + if (x < chroma_width) { + // Begin right edge iteration. Same as the normal iterations, but the + // |average_luma| computation requires a duplicated luma value at the + // end. + const int luma_x = x << subsampling_x; + const int valid_range = width - luma_x; + assert(valid_range < 16); + // There is no need to pre-initialize this buffer, because merged values + // used as indices are saturated in the 8bpp case. Uninitialized values + // are written outside the frame. 
+ memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0])); + luma_buffer[valid_range] = in_y_row[width - 1]; + const int valid_range_chroma = chroma_width - x; + uint8_t chroma_buffer[8]; + memcpy(chroma_buffer, &in_chroma_row[x], + valid_range_chroma * sizeof(in_chroma_row[0])); + + const __m128i average_luma = + GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1); + const __m128i orig_chroma = + LoadSourceMsan(chroma_buffer, valid_range_chroma); + const __m128i blended = BlendChromaValsNoCfl8bpp( + scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), + average_luma, derived_scaling_shift, offset, multipliers); + StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling)); + // End of right edge iteration. + } + + in_y_row += source_stride_y << subsampling_y; + in_chroma_row += source_stride_chroma; + out_chroma_row += dest_stride; + } while (++y < chroma_height); +} + +// This function is for the case params_.chroma_scaling_from_luma == false. +void BlendNoiseWithImageChroma8bpp_SSE4_1( + Plane plane, const FilmGrainParams& params, const void* noise_image_ptr, + int min_value, int max_chroma, int width, int height, int start_height, + int subsampling_x, int subsampling_y, + const uint8_t scaling_lut[kScalingLookupTableSize], + const void* source_plane_y, ptrdiff_t source_stride_y, + const void* source_plane_uv, ptrdiff_t source_stride_uv, + void* dest_plane_uv, ptrdiff_t dest_stride_uv) { + assert(plane == kPlaneU || plane == kPlaneV); + const auto* noise_image = + static_cast<const Array2D<int8_t>*>(noise_image_ptr); + const auto* in_y = static_cast<const uint8_t*>(source_plane_y); + const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv); + auto* out_uv = static_cast<uint8_t*>(dest_plane_uv); + + const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset; + const int luma_multiplier = + (plane == kPlaneU) ? 
params.u_luma_multiplier : params.v_luma_multiplier; + const int multiplier = + (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier; + BlendChromaPlane8bpp_SSE4_1( + noise_image[plane], min_value, max_chroma, width, height, start_height, + subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier, + luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv, + source_stride_uv, out_uv, dest_stride_uv); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + + dsp->film_grain.blend_noise_luma = + BlendNoiseWithImageLuma_SSE4_1<8, int8_t, uint8_t>; + dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_SSE4_1; + dsp->film_grain.blend_noise_chroma[1] = + BlendNoiseWithImageChromaWithCfl_SSE4_1<8, int8_t, uint8_t>; +} + +} // namespace +} // namespace low_bitdepth + +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + + dsp->film_grain.blend_noise_luma = + BlendNoiseWithImageLuma_SSE4_1<10, int16_t, uint16_t>; + dsp->film_grain.blend_noise_chroma[1] = + BlendNoiseWithImageChromaWithCfl_SSE4_1<10, int16_t, uint16_t>; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace film_grain + +void FilmGrainInit_SSE4_1() { + film_grain::low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + film_grain::high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_SSE4_1 + +namespace libgav1 { +namespace dsp { + +void FilmGrainInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/film_grain_sse4.h b/src/dsp/x86/film_grain_sse4.h new file mode 100644 index 0000000..1cacbac --- /dev/null +++ b/src/dsp/x86/film_grain_sse4.h @@ -0,0 +1,40 @@ 
+/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initialize members of Dsp::film_grain. This function is not thread-safe. +void FilmGrainInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_TARGETING_SSE4_1 +#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_SSE4_1 +#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_SSE4_1 +#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_SSE4_1 +#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_SSE4_1 +#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_SSE4_1 +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_ diff --git a/src/dsp/x86/intra_edge_sse4.cc b/src/dsp/x86/intra_edge_sse4.cc index 4a8658d..d6af907 100644 --- a/src/dsp/x86/intra_edge_sse4.cc +++ b/src/dsp/x86/intra_edge_sse4.cc @@ -22,7 +22,7 @@ #include <cassert> #include <cstddef> #include <cstdint> -#include <cstring> // memcpy +#include <cstring> #include "src/dsp/constants.h" #include "src/dsp/dsp.h" @@ -259,7 +259,7 @@ void IntraEdgeInit_SSE4_1() { Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace 
libgav1 { namespace dsp { diff --git a/src/dsp/x86/intrapred_cfl_sse4.cc b/src/dsp/x86/intrapred_cfl_sse4.cc index fac1556..f2dcfdb 100644 --- a/src/dsp/x86/intrapred_cfl_sse4.cc +++ b/src/dsp/x86/intrapred_cfl_sse4.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "src/dsp/intrapred.h" +#include "src/dsp/intrapred_cfl.h" #include "src/utils/cpu.h" #if LIBGAV1_TARGETING_SSE4_1 @@ -29,9 +29,48 @@ #include "src/dsp/x86/common_sse4.h" #include "src/utils/common.h" #include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" namespace libgav1 { namespace dsp { +namespace { + +// This duplicates the last two 16-bit values in |row|. +inline __m128i LastRowSamples(const __m128i row) { + return _mm_shuffle_epi32(row, 0xFF); +} + +// This duplicates the last 16-bit value in |row|. +inline __m128i LastRowResult(const __m128i row) { + const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF); + return _mm_shuffle_epi32(dup_row, 0xFF); +} + +// Takes in two sums of input row pairs, and completes the computation for two +// output rows. +inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0, + const __m128i vertical_sum1, + int16_t* luma_ptr) { + __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1); + result = _mm_slli_epi16(result, 1); + StoreLo8(luma_ptr, result); + StoreHi8(luma_ptr + kCflLumaBufferStride, result); + return result; +} + +// Takes two halves of a vertically added pair of rows and completes the +// computation for one output row. 
+inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0, + const __m128i vertical_sum1, + int16_t* luma_ptr) { + __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1); + result = _mm_slli_epi16(result, 1); + StoreUnaligned16(luma_ptr, result); + return result; +} + +} // namespace + namespace low_bitdepth { namespace { @@ -40,8 +79,8 @@ namespace { inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12, __m128i alpha_sign, __m128i dc_q0) { - __m128i ac_q3 = LoadUnaligned16(input); - __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); + const __m128i ac_q3 = LoadUnaligned16(input); + const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12); scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign); return _mm_add_epi16(scaled_luma_q0, dc_q0); @@ -88,8 +127,7 @@ void CflIntraPredictor_SSE4_1( template <int block_height_log2, bool is_inside> void CflSubsampler444_4xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], - const int /*max_luma_width*/, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const int max_luma_height, const void* const source, ptrdiff_t stride) { static_assert(block_height_log2 <= 4, ""); const int block_height = 1 << block_height_log2; const int visible_height = max_luma_height; @@ -119,12 +157,15 @@ void CflSubsampler444_4xH_SSE4_1( } while (y < visible_height); if (!is_inside) { - int y = visible_height; + // Replicate the 2 high lanes. 
+ samples = _mm_shuffle_epi32(samples, 0xee); do { + StoreLo8(luma_ptr, samples); + luma_ptr += kCflLumaBufferStride; StoreHi8(luma_ptr, samples); luma_ptr += kCflLumaBufferStride; sum = _mm_add_epi16(sum, samples); - ++y; + y += 2; } while (y < block_height); } @@ -152,15 +193,15 @@ void CflSubsampler444_4xH_SSE4_1( static_assert(block_height_log2 <= 4, ""); assert(max_luma_width >= 4); assert(max_luma_height >= 4); - const int block_height = 1 << block_height_log2; - const int block_width = 4; + static_cast<void>(max_luma_width); + constexpr int block_height = 1 << block_height_log2; - if (block_height <= max_luma_height && block_width <= max_luma_width) { - CflSubsampler444_4xH_SSE4_1<block_height_log2, true>( - luma, max_luma_width, max_luma_height, source, stride); + if (block_height <= max_luma_height) { + CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height, + source, stride); } else { - CflSubsampler444_4xH_SSE4_1<block_height_log2, false>( - luma, max_luma_width, max_luma_height, source, stride); + CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height, + source, stride); } } @@ -302,19 +343,9 @@ void CflSubsampler444_SSE4_1( __m128i inner_sum_lo, inner_sum_hi; int y = 0; do { -#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are - // then masked off by blendv, MSAN isn't smart enough to - // understand that. So we switch to a C implementation here. - uint16_t c_arr[16]; - for (int x = 0; x < 16; x++) { - const int x_index = std::min(x, visible_width_16 - 1); - c_arr[x] = src[x_index] << 3; - } - samples0 = LoadUnaligned16(c_arr); - samples1 = LoadUnaligned16(c_arr + 8); - static_cast<void>(blend_mask_16); -#else - __m128i samples01 = LoadUnaligned16(src); + // We can load uninitialized values here. Even though they are then masked + // off by blendv, MSAN doesn't model that behavior. 
+ __m128i samples01 = LoadUnaligned16Msan(src, invisible_width_16); if (!inside) { const __m128i border16 = @@ -323,26 +354,15 @@ void CflSubsampler444_SSE4_1( } samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples01), 3); samples1 = _mm_slli_epi16(_mm_unpackhi_epi8(samples01, zero), 3); -#endif // LIBGAV1_MSAN StoreUnaligned16(luma_ptr, samples0); StoreUnaligned16(luma_ptr + 8, samples1); __m128i inner_sum = _mm_add_epi16(samples0, samples1); if (block_width == 32) { -#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are - // then masked off by blendv, MSAN isn't smart enough to - // understand that. So we switch to a C implementation here. - uint16_t c_arr[16]; - for (int x = 16; x < 32; x++) { - const int x_index = std::min(x, visible_width_32 - 1); - c_arr[x - 16] = src[x_index] << 3; - } - samples2 = LoadUnaligned16(c_arr); - samples3 = LoadUnaligned16(c_arr + 8); - static_cast<void>(blend_mask_32); -#else - __m128i samples23 = LoadUnaligned16(src + 16); + // We can load uninitialized values here. Even though they are then masked + // off by blendv, MSAN doesn't model that behavior. + __m128i samples23 = LoadUnaligned16Msan(src + 16, invisible_width_32); if (!inside) { const __m128i border32 = _mm_set1_epi8(static_cast<int8_t>(src[visible_width_32 - 1])); @@ -350,7 +370,6 @@ void CflSubsampler444_SSE4_1( } samples2 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples23), 3); samples3 = _mm_slli_epi16(_mm_unpackhi_epi8(samples23, zero), 3); -#endif // LIBGAV1_MSAN StoreUnaligned16(luma_ptr + 16, samples2); StoreUnaligned16(luma_ptr + 24, samples3); @@ -418,29 +437,6 @@ void CflSubsampler444_SSE4_1( } } -// Takes in two sums of input row pairs, and completes the computation for two -// output rows. 
-inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0, - const __m128i vertical_sum1, - int16_t* luma_ptr) { - __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1); - result = _mm_slli_epi16(result, 1); - StoreLo8(luma_ptr, result); - StoreHi8(luma_ptr + kCflLumaBufferStride, result); - return result; -} - -// Takes two halves of a vertically added pair of rows and completes the -// computation for one output row. -inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0, - const __m128i vertical_sum1, - int16_t* luma_ptr) { - __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1); - result = _mm_slli_epi16(result, 1); - StoreUnaligned16(luma_ptr, result); - return result; -} - template <int block_height_log2> void CflSubsampler420_4xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], @@ -511,17 +507,6 @@ void CflSubsampler420_4xH_SSE4_1( } } -// This duplicates the last two 16-bit values in |row|. -inline __m128i LastRowSamples(const __m128i row) { - return _mm_shuffle_epi32(row, 0xFF); -} - -// This duplicates the last 16-bit value in |row|. -inline __m128i LastRowResult(const __m128i row) { - const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF); - return _mm_shuffle_epi32(dup_row, 0xFF); -} - template <int block_height_log2, int max_luma_width> inline void CflSubsampler420Impl_8xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], @@ -655,10 +640,11 @@ inline void CflSubsampler420Impl_WxH_SSE4_1( __m128i final_sum = zero; const int block_height = 1 << block_height_log2; const int luma_height = std::min(block_height, max_luma_height >> 1); + static_assert(max_luma_width <= 32, ""); int16_t* luma_ptr = luma[0]; __m128i final_row_result; - // Begin first y section, covering width up to 16. + // Begin first y section, covering width up to 32. 
int y = 0; do { const uint8_t* src_next = src + stride; @@ -694,29 +680,32 @@ inline void CflSubsampler420Impl_WxH_SSE4_1( final_row_result = StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8); sum = _mm_add_epi16(sum, final_row_result); + if (block_width_log2 == 5) { + const __m128i wide_fill = LastRowResult(final_row_result); + sum = _mm_add_epi16(sum, wide_fill); + sum = _mm_add_epi16(sum, wide_fill); + } final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); src += stride << 1; luma_ptr += kCflLumaBufferStride; } while (++y < luma_height); - // Because max_luma_width is at most 32, any values beyond x=16 will - // necessarily be duplicated. - if (block_width_log2 == 5) { - const __m128i wide_fill = LastRowResult(final_row_result); - // Multiply duplicated value by number of occurrences, height * 4, since - // there are 16 in each row and the value appears in the vector 4 times. - final_sum = _mm_add_epi32( - final_sum, - _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), block_height_log2 + 2)); - } - // Begin second y section. if (y < block_height) { const __m128i final_fill0 = LoadUnaligned16(luma_ptr - kCflLumaBufferStride); const __m128i final_fill1 = LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8); + __m128i wide_fill; + + if (block_width_log2 == 5) { + // There are 16 16-bit fill values per row, shifting by 2 accounts for + // the widening to 32-bit. 
+ wide_fill = + _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2); + } + const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1); const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum); const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero); @@ -726,6 +715,9 @@ inline void CflSubsampler420Impl_WxH_SSE4_1( do { StoreUnaligned16(luma_ptr, final_fill0); StoreUnaligned16(luma_ptr + 8, final_fill1); + if (block_width_log2 == 5) { + final_sum = _mm_add_epi32(final_sum, wide_fill); + } luma_ptr += kCflLumaBufferStride; final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); @@ -747,14 +739,10 @@ inline void CflSubsampler420Impl_WxH_SSE4_1( const __m128i samples1 = LoadUnaligned16(luma_ptr + 8); final_row_result = _mm_sub_epi16(samples1, averages); StoreUnaligned16(luma_ptr + 8, final_row_result); - } - if (block_width_log2 == 5) { - int16_t* wide_luma_ptr = luma[0] + 16; - const __m128i wide_fill = LastRowResult(final_row_result); - for (int i = 0; i < block_height; - ++i, wide_luma_ptr += kCflLumaBufferStride) { - StoreUnaligned16(wide_luma_ptr, wide_fill); - StoreUnaligned16(wide_luma_ptr + 8, wide_fill); + if (block_width_log2 == 5) { + const __m128i wide_fill = LastRowResult(final_row_result); + StoreUnaligned16(luma_ptr + 16, wide_fill); + StoreUnaligned16(luma_ptr + 24, wide_fill); } } } @@ -958,7 +946,882 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void IntraPredCflInit_SSE4_1() { low_bitdepth::Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +//------------------------------------------------------------------------------ +// CflIntraPredictor_10bpp_SSE4_1 + +inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12, + __m128i alpha_sign, __m128i dc_q0) { + const __m128i ac_q3 = LoadUnaligned16(input); + const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); + __m128i scaled_luma_q0 = 
_mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12); + scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign); + return _mm_add_epi16(scaled_luma_q0, dc_q0); +} + +inline __m128i ClipEpi16(__m128i x, __m128i min, __m128i max) { + return _mm_max_epi16(_mm_min_epi16(x, max), min); +} + +template <int width, int height> +void CflIntraPredictor_10bpp_SSE4_1( + void* const dest, ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + constexpr int kCflLumaBufferStrideLog2_16i = 5; + constexpr int kCflLumaBufferStrideLog2_128i = + kCflLumaBufferStrideLog2_16i - 3; + constexpr int kRowIncr = 1 << kCflLumaBufferStrideLog2_128i; + auto* dst = static_cast<uint16_t*>(dest); + const __m128i alpha_sign = _mm_set1_epi16(alpha); + const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); + auto* row = reinterpret_cast<const __m128i*>(luma); + const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i); + const __m128i dc_val = _mm_set1_epi16(dst[0]); + const __m128i min = _mm_setzero_si128(); + const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1); + + stride >>= 1; + + do { + __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val); + res = ClipEpi16(res, min, max); + if (width == 4) { + StoreLo8(dst, res); + } else if (width == 8) { + StoreUnaligned16(dst, res); + } else if (width == 16) { + StoreUnaligned16(dst, res); + const __m128i res_1 = + CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val); + StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max)); + } else { + StoreUnaligned16(dst, res); + const __m128i res_1 = + CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val); + StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max)); + const __m128i res_2 = + CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val); + StoreUnaligned16(dst + 16, ClipEpi16(res_2, min, max)); + const __m128i res_3 = + CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val); + 
StoreUnaligned16(dst + 24, ClipEpi16(res_3, min, max)); + } + + dst += stride; + } while ((row += kRowIncr) < row_end); +} + +template <int block_height_log2, bool is_inside> +void CflSubsampler444_4xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + static_assert(block_height_log2 <= 4, ""); + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + __m128i zero = _mm_setzero_si128(); + __m128i sum = zero; + __m128i samples; + int y = visible_height; + + do { + samples = LoadHi8(LoadLo8(src), src + src_stride); + src += src_stride << 1; + sum = _mm_add_epi16(sum, samples); + y -= 2; + } while (y != 0); + + if (!is_inside) { + y = visible_height; + samples = _mm_unpackhi_epi64(samples, samples); + do { + sum = _mm_add_epi16(sum, samples); + y += 2; + } while (y < block_height); + } + + sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4)); + + // Here the left shift by 3 (to increase precision) is nullified in right + // shift ((log2 of width 4) + 1). 
+ __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2 - 1); + averages = _mm_shufflelo_epi16(averages, 0); + src = static_cast<const uint16_t*>(source); + luma_ptr = luma[0]; + y = visible_height; + do { + samples = LoadLo8(src); + samples = _mm_slli_epi16(samples, 3); + StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages)); + src += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_height_log2> +void CflSubsampler444_4xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_cast<void>(max_luma_width); + static_cast<void>(max_luma_height); + static_assert(block_height_log2 <= 4, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + + if (block_height <= max_luma_height) { + CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height, + source, stride); + } +} + +template <int block_height_log2, bool is_inside> +void CflSubsampler444_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + const __m128i zero = _mm_setzero_si128(); + __m128i sum = zero; + __m128i samples; + int y = visible_height; + + do { + samples 
= LoadUnaligned16(src); + src += src_stride; + sum = _mm_add_epi16(sum, samples); + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + do { + sum = _mm_add_epi16(sum, samples); + } while (++y < block_height); + } + + sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4)); + + // Here the left shift by 3 (to increase precision) is nullified in right + // shift (log2 of width 8). + __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2); + averages = _mm_shuffle_epi8(averages, dup16); + + src = static_cast<const uint16_t*>(source); + luma_ptr = luma[0]; + y = visible_height; + do { + samples = LoadUnaligned16(src); + samples = _mm_slli_epi16(samples, 3); + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages)); + src += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_height_log2> +void CflSubsampler444_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_cast<void>(max_luma_width); + static_cast<void>(max_luma_height); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + const int block_width = 8; + + const int horz_inside = block_width <= max_luma_width; + const int vert_inside = block_height <= max_luma_height; + if (horz_inside && vert_inside) { + CflSubsampler444_8xH_SSE4_1<block_height_log2, true>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler444_8xH_SSE4_1<block_height_log2, false>(luma, 
max_luma_height, + source, stride); + } +} + +template <int block_width_log2, int block_height_log2, bool is_inside> +void CflSubsampler444_WxH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const int block_width = 1 << block_width_log2; + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const __m128i zero = _mm_setzero_si128(); + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + __m128i sum = zero; + __m128i inner_sum_lo, inner_sum_hi; + __m128i samples[4]; + int y = visible_height; + + do { + samples[0] = LoadUnaligned16(src); + samples[1] = (max_luma_width >= 16) ? LoadUnaligned16(src + 8) + : LastRowResult(samples[0]); + __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]); + if (block_width == 32) { + samples[2] = (max_luma_width >= 24) ? LoadUnaligned16(src + 16) + : LastRowResult(samples[1]); + samples[3] = (max_luma_width == 32) ? 
LoadUnaligned16(src + 24) + : LastRowResult(samples[2]); + + inner_sum = _mm_add_epi16(samples[2], inner_sum); + inner_sum = _mm_add_epi16(samples[3], inner_sum); + } + inner_sum_lo = _mm_cvtepu16_epi32(inner_sum); + inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero); + sum = _mm_add_epi32(sum, inner_sum_lo); + sum = _mm_add_epi32(sum, inner_sum_hi); + src += src_stride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]); + if (block_width == 32) { + inner_sum = _mm_add_epi16(samples[2], inner_sum); + inner_sum = _mm_add_epi16(samples[3], inner_sum); + } + inner_sum_lo = _mm_cvtepu16_epi32(inner_sum); + inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero); + do { + sum = _mm_add_epi32(sum, inner_sum_lo); + sum = _mm_add_epi32(sum, inner_sum_hi); + } while (++y < block_height); + } + + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4)); + + // Here the left shift by 3 (to increase precision) is subtracted in right + // shift factor (block_width_log2 + block_height_log2 - 3). 
+ __m128i averages = + RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2 - 3); + averages = _mm_shuffle_epi8(averages, dup16); + + src = static_cast<const uint16_t*>(source); + __m128i samples_ext = zero; + luma_ptr = luma[0]; + y = visible_height; + do { + int idx = 0; + for (int x = 0; x < block_width; x += 8) { + if (max_luma_width > x) { + samples[idx] = LoadUnaligned16(&src[x]); + samples[idx] = _mm_slli_epi16(samples[idx], 3); + samples_ext = samples[idx]; + } else { + samples[idx] = LastRowResult(samples_ext); + } + StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages)); + } + src += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + int idx = 0; + for (int x = 0; x < block_width; x += 8) { + StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages)); + } + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_width_log2, int block_height_log2> +void CflSubsampler444_WxH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_assert(block_width_log2 == 4 || block_width_log2 == 5, + "This function will only work for block_width 16 and 32."); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + + const int block_height = 1 << block_height_log2; + const int vert_inside = block_height <= max_luma_height; + if (vert_inside) { + CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, true>( + luma, max_luma_width, max_luma_height, source, stride); + } else { + CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, false>( + luma, max_luma_width, max_luma_height, source, stride); + } +} + +template <int block_height_log2> +void CflSubsampler420_4xH_SSE4_1( + int16_t 
luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int /*max_luma_width*/, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + const __m128i zero = _mm_setzero_si128(); + __m128i final_sum = zero; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = luma_height; + + do { + const __m128i samples_row0 = LoadUnaligned16(src); + src += src_stride; + const __m128i samples_row1 = LoadUnaligned16(src); + src += src_stride; + const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1); + + const __m128i samples_row2 = LoadUnaligned16(src); + src += src_stride; + const __m128i samples_row3 = LoadUnaligned16(src); + src += src_stride; + const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3); + __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr); + luma_ptr += kCflLumaBufferStride << 1; + + const __m128i samples_row4 = LoadUnaligned16(src); + src += src_stride; + const __m128i samples_row5 = LoadUnaligned16(src); + src += src_stride; + const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5); + + const __m128i samples_row6 = LoadUnaligned16(src); + src += src_stride; + const __m128i samples_row7 = LoadUnaligned16(src); + src += src_stride; + const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7); + sum = _mm_add_epi16( + sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr)); + luma_ptr += kCflLumaBufferStride << 1; + + final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); + final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); + y -= 4; + } while (y != 0); + + const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride); + const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill); + for (y = luma_height; y < 
block_height; ++y) { + StoreLo8(luma_ptr, final_fill); + luma_ptr += kCflLumaBufferStride; + final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); + } + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8)); + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4)); + + __m128i averages = RightShiftWithRounding_U32( + final_sum, block_height_log2 + 2 /*log2 of width 4*/); + + averages = _mm_shufflelo_epi16(averages, 0); + luma_ptr = luma[0]; + y = block_height; + do { + const __m128i samples = LoadLo8(luma_ptr); + StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template <int block_height_log2, int max_luma_width> +inline void CflSubsampler420Impl_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + const __m128i zero = _mm_setzero_si128(); + __m128i final_sum = zero; + int16_t* luma_ptr = luma[0]; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = luma_height; + + do { + const __m128i samples_row00 = LoadUnaligned16(src); + const __m128i samples_row01 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row00); + src += src_stride; + const __m128i samples_row10 = LoadUnaligned16(src); + const __m128i samples_row11 = (max_luma_width == 16) + ? 
LoadUnaligned16(src + 8) + : LastRowSamples(samples_row10); + src += src_stride; + const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10); + const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11); + __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr); + luma_ptr += kCflLumaBufferStride; + + const __m128i samples_row20 = LoadUnaligned16(src); + const __m128i samples_row21 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row20); + src += src_stride; + const __m128i samples_row30 = LoadUnaligned16(src); + const __m128i samples_row31 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row30); + src += src_stride; + const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30); + const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31); + sum = _mm_add_epi16( + sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const __m128i samples_row40 = LoadUnaligned16(src); + const __m128i samples_row41 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row40); + src += src_stride; + const __m128i samples_row50 = LoadUnaligned16(src); + const __m128i samples_row51 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row50); + src += src_stride; + const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50); + const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51); + sum = _mm_add_epi16( + sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const __m128i samples_row60 = LoadUnaligned16(src); + const __m128i samples_row61 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row60); + src += src_stride; + const __m128i samples_row70 = LoadUnaligned16(src); + const __m128i samples_row71 = (max_luma_width == 16) + ? 
LoadUnaligned16(src + 8) + : LastRowSamples(samples_row70); + src += src_stride; + const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70); + const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71); + sum = _mm_add_epi16( + sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); + final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); + y -= 4; + } while (y != 0); + + // Duplicate the final row downward to the end after max_luma_height. + const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride); + const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill); + const __m128i final_fill_to_sum1 = + _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8)); + const __m128i final_fill_to_sum = + _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1); + for (y = luma_height; y < block_height; ++y) { + StoreUnaligned16(luma_ptr, final_fill); + luma_ptr += kCflLumaBufferStride; + final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); + } + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8)); + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4)); + + __m128i averages = RightShiftWithRounding_S32( + final_sum, block_height_log2 + 3 /*log2 of width 8*/); + + averages = _mm_shufflelo_epi16(averages, 0); + averages = _mm_shuffle_epi32(averages, 0); + luma_ptr = luma[0]; + y = block_height; + do { + const __m128i samples = LoadUnaligned16(luma_ptr); + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template <int block_height_log2> +void CflSubsampler420_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + if (max_luma_width == 8) { + 
CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>( + luma, max_luma_height, source, stride); + } +} + +template <int block_width_log2, int block_height_log2, int max_luma_width> +inline void CflSubsampler420Impl_WxH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + const __m128i zero = _mm_setzero_si128(); + __m128i final_sum = zero; + const int block_height = 1 << block_height_log2; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int16_t* luma_ptr = luma[0]; + __m128i final_row_result; + // Begin first y section, covering width up to 32. + int y = luma_height; + + do { + const uint16_t* src_next = src + src_stride; + const __m128i samples_row00 = LoadUnaligned16(src); + const __m128i samples_row01 = (max_luma_width >= 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row00); + const __m128i samples_row02 = (max_luma_width >= 24) + ? LoadUnaligned16(src + 16) + : LastRowSamples(samples_row01); + const __m128i samples_row03 = (max_luma_width == 32) + ? LoadUnaligned16(src + 24) + : LastRowSamples(samples_row02); + const __m128i samples_row10 = LoadUnaligned16(src_next); + const __m128i samples_row11 = (max_luma_width >= 16) + ? LoadUnaligned16(src_next + 8) + : LastRowSamples(samples_row10); + const __m128i samples_row12 = (max_luma_width >= 24) + ? LoadUnaligned16(src_next + 16) + : LastRowSamples(samples_row11); + const __m128i samples_row13 = (max_luma_width == 32) + ? 
LoadUnaligned16(src_next + 24) + : LastRowSamples(samples_row12); + const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10); + const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11); + const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12); + const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13); + __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr); + final_row_result = + StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8); + sum = _mm_add_epi16(sum, final_row_result); + final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); + final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); + + // Because max_luma_width is at most 32, any values beyond x=16 will + // necessarily be duplicated. + if (block_width_log2 == 5) { + const __m128i wide_fill = LastRowResult(final_row_result); + // There are 16 16-bit fill values per row, shifting by 2 accounts for + // the widening to 32-bit. + final_sum = _mm_add_epi32( + final_sum, _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), 2)); + } + src += src_stride << 1; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + // Begin second y section. + y = luma_height; + if (y < block_height) { + const __m128i final_fill0 = + LoadUnaligned16(luma_ptr - kCflLumaBufferStride); + const __m128i final_fill1 = + LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8); + __m128i wide_fill; + if (block_width_log2 == 5) { + // There are 16 16-bit fill values per row, shifting by 2 accounts for + // the widening to 32-bit. 
+ wide_fill = + _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2); + } + const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1); + const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum); + const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero); + const __m128i final_fill_to_sum = + _mm_add_epi32(final_inner_sum0, final_inner_sum1); + + do { + StoreUnaligned16(luma_ptr, final_fill0); + StoreUnaligned16(luma_ptr + 8, final_fill1); + if (block_width_log2 == 5) { + final_sum = _mm_add_epi32(final_sum, wide_fill); + } + luma_ptr += kCflLumaBufferStride; + final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); + } while (++y < block_height); + } // End second y section. + + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8)); + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4)); + + __m128i averages = RightShiftWithRounding_S32( + final_sum, block_width_log2 + block_height_log2); + averages = _mm_shufflelo_epi16(averages, 0); + averages = _mm_shuffle_epi32(averages, 0); + + luma_ptr = luma[0]; + y = block_height; + do { + const __m128i samples0 = LoadUnaligned16(luma_ptr); + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages)); + const __m128i samples1 = LoadUnaligned16(luma_ptr + 8); + final_row_result = _mm_sub_epi16(samples1, averages); + StoreUnaligned16(luma_ptr + 8, final_row_result); + + if (block_width_log2 == 5) { + const __m128i wide_fill = LastRowResult(final_row_result); + StoreUnaligned16(luma_ptr + 16, wide_fill); + StoreUnaligned16(luma_ptr + 24, wide_fill); + } + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template <int block_width_log2, int block_height_log2> +void CflSubsampler420_WxH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + switch (max_luma_width) { + case 8: + 
CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>( + luma, max_luma_height, source, stride); + return; + case 16: + CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>( + luma, max_luma_height, source, stride); + return; + case 24: + CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>( + luma, max_luma_height, source, stride); + return; + default: + assert(max_luma_width == 32); + CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>( + luma, max_luma_height, source, stride); + return; + } +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize4x4] = + CflIntraPredictor_10bpp_SSE4_1<4, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize4x8] = + CflIntraPredictor_10bpp_SSE4_1<4, 8>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize4x16] = + CflIntraPredictor_10bpp_SSE4_1<4, 16>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x4] = + CflIntraPredictor_10bpp_SSE4_1<8, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x8] = + CflIntraPredictor_10bpp_SSE4_1<8, 8>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x16] = + CflIntraPredictor_10bpp_SSE4_1<8, 16>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x32] = + CflIntraPredictor_10bpp_SSE4_1<8, 32>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x4] = + 
CflIntraPredictor_10bpp_SSE4_1<16, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x8] = + CflIntraPredictor_10bpp_SSE4_1<16, 8>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor_10bpp_SSE4_1<16, 16>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor_10bpp_SSE4_1<16, 32>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize32x8] = + CflIntraPredictor_10bpp_SSE4_1<32, 8>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor_10bpp_SSE4_1<32, 16>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor_10bpp_SSE4_1<32, 32>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = + CflSubsampler420_4xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = + CflSubsampler420_4xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler420_4xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler420) + 
dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<5>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 2>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 5>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<5, 3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<5, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<5, 5>; +#endif + +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = + CflSubsampler444_4xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = + CflSubsampler444_4xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler444) + 
dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler444_4xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<5>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<4, 2>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<4, 3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<4, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<4, 5>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<5, 3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<5, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = 
+ CflSubsampler444_WxH_SSE4_1<5, 5>; +#endif +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraPredCflInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/x86/intrapred_cfl_sse4.h b/src/dsp/x86/intrapred_cfl_sse4.h new file mode 100644 index 0000000..5d1a425 --- /dev/null +++ b/src/dsp/x86/intrapred_cfl_sse4.h @@ -0,0 +1,376 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers, see the +// defines below for specifics. These functions are not thread-safe. +void IntraPredCflInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. 
+#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 +#define 
LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor +#define 
LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +//------------------------------------------------------------------------------ +// 10bpp + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_ diff --git a/src/dsp/x86/intrapred_directional_sse4.cc b/src/dsp/x86/intrapred_directional_sse4.cc new file mode 100644 index 0000000..e642aee --- /dev/null +++ b/src/dsp/x86/intrapred_directional_sse4.cc @@ -0,0 +1,1478 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/intrapred_directional.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 + +#include <smmintrin.h> + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/dsp/x86/transpose_sse4.h" +#include "src/utils/common.h" +#include "src/utils/memory.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +//------------------------------------------------------------------------------ +// 7.11.2.4. Directional intra prediction process + +// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning +// upsampling is ruled out. In addition, the bits masked by 0x3F for +// |shift_val| are 0 for all multiples of 64, so the formula +// val = top[top_base_x]*shift + top[top_base_x+1]*(32-shift), reduces to +// val = top[top_base_x+1] << 5, meaning only the second set of pixels is +// involved in the output. Hence |top| is offset by 1. 
+inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride, + const uint8_t* const top, const int width, + const int height) { + ptrdiff_t offset = 1; + if (height == 4) { + memcpy(dst, top + offset, width); + dst += stride; + memcpy(dst, top + offset + 1, width); + dst += stride; + memcpy(dst, top + offset + 2, width); + dst += stride; + memcpy(dst, top + offset + 3, width); + return; + } + int y = 0; + do { + memcpy(dst, top + offset, width); + dst += stride; + memcpy(dst, top + offset + 1, width); + dst += stride; + memcpy(dst, top + offset + 2, width); + dst += stride; + memcpy(dst, top + offset + 3, width); + dst += stride; + memcpy(dst, top + offset + 4, width); + dst += stride; + memcpy(dst, top + offset + 5, width); + dst += stride; + memcpy(dst, top + offset + 6, width); + dst += stride; + memcpy(dst, top + offset + 7, width); + dst += stride; + + offset += 8; + y += 8; + } while (y < height); +} + +inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride, + const uint8_t* const top, const int height, + const int xstep, const bool upsampled) { + const int upsample_shift = static_cast<int>(upsampled); + const int scale_bits = 6 - upsample_shift; + const __m128i max_shift = _mm_set1_epi8(32); + // Downscaling for a weighted average whose weights sum to 32 (max_shift). + const int rounding_bits = 5; + const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift; + const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]); + const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100) + : _mm_set_epi64x(0, 0x0403030202010100); + // Each 16-bit value here corresponds to a position that may exceed + // |max_base_x|. When added to the top_base_x, it is used to mask values + // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is + // not supported for packed integers. 
+ const __m128i offsets = + _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); + + // All rows from |min_corner_only_y| down will simply use memcpy. |max_base_x| + // is always greater than |height|, so clipping to 1 is enough to make the + // logic work. + const int xstep_units = std::max(xstep >> scale_bits, 1); + const int min_corner_only_y = std::min(max_base_x / xstep_units, height); + + // Rows up to this y-value can be computed without checking for bounds. + int y = 0; + int top_x = xstep; + + for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) { + const int top_base_x = top_x >> scale_bits; + + // Permit negative values of |top_x|. + const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi8(shift_val); + const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); + __m128i top_index_vect = _mm_set1_epi16(top_base_x); + top_index_vect = _mm_add_epi16(top_index_vect, offsets); + const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x); + + // Load 8 values because we will select the sampled values based on + // |upsampled|. + const __m128i values = LoadLo8(top + top_base_x); + const __m128i sampled_values = _mm_shuffle_epi8(values, sampler); + const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect); + __m128i prod = _mm_maddubs_epi16(sampled_values, shifts); + prod = RightShiftWithRounding_U16(prod, rounding_bits); + // Replace pixels from invalid range with top-right corner. + prod = _mm_blendv_epi8(prod, final_top_val, past_max); + Store4(dst, _mm_packus_epi16(prod, prod)); + } + + // Fill in corner-only rows. 
+ for (; y < height; ++y) { + memset(dst, top[max_base_x], /* width */ 4); + dst += stride; + } +} + +// 7.11.2.4 (7) angle < 90 +inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride, + const uint8_t* const top_row, + const int width, const int height, + const int xstep, const bool upsampled) { + const int upsample_shift = static_cast<int>(upsampled); + const __m128i sampler = + upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) + : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); + const int scale_bits = 6 - upsample_shift; + const int max_base_x = ((width + height) - 1) << upsample_shift; + + const __m128i max_shift = _mm_set1_epi8(32); + // Downscaling for a weighted average whose weights sum to 32 (max_shift). + const int rounding_bits = 5; + const int base_step = 1 << upsample_shift; + const int base_step8 = base_step << 3; + + // All rows from |min_corner_only_y| down will simply use memcpy. |max_base_x| + // is always greater than |height|, so clipping to 1 is enough to make the + // logic work. + const int xstep_units = std::max(xstep >> scale_bits, 1); + const int min_corner_only_y = std::min(max_base_x / xstep_units, height); + + // Rows up to this y-value can be computed without checking for bounds. + const int max_no_corner_y = std::min( + LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep, + height); + // No need to check for exceeding |max_base_x| in the first loop. + int y = 0; + int top_x = xstep; + for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) { + int top_base_x = top_x >> scale_bits; + // Permit negative values of |top_x|. 
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi8(shift_val); + const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); + int x = 0; + do { + const __m128i top_vals = LoadUnaligned16(top_row + top_base_x); + __m128i vals = _mm_shuffle_epi8(top_vals, sampler); + vals = _mm_maddubs_epi16(vals, shifts); + vals = RightShiftWithRounding_U16(vals, rounding_bits); + StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); + top_base_x += base_step8; + x += 8; + } while (x < width); + } + + // Each 16-bit value here corresponds to a position that may exceed + // |max_base_x|. When added to the top_base_x, it is used to mask values + // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is + // not supported for packed integers. + const __m128i offsets = + _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); + + const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x); + const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]); + const __m128i base_step8_vect = _mm_set1_epi16(base_step8); + for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) { + int top_base_x = top_x >> scale_bits; + + const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi8(shift_val); + const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); + __m128i top_index_vect = _mm_set1_epi16(top_base_x); + top_index_vect = _mm_add_epi16(top_index_vect, offsets); + + int x = 0; + const int min_corner_only_x = + std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7; + for (; x < min_corner_only_x; + x += 8, top_base_x += base_step8, + top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) { + const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect); + // Assuming a 
buffer zone of 8 bytes at the end of top_row, this prevents + // reading out of bounds. If all indices are past max and we don't need to + // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will + // reset for the next |y|. + top_base_x &= ~_mm_cvtsi128_si32(past_max); + const __m128i top_vals = LoadUnaligned16(top_row + top_base_x); + __m128i vals = _mm_shuffle_epi8(top_vals, sampler); + vals = _mm_maddubs_epi16(vals, shifts); + vals = RightShiftWithRounding_U16(vals, rounding_bits); + vals = _mm_blendv_epi8(vals, final_top_val, past_max); + StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); + } + // Corner-only section of the row. + memset(dest + x, top_row[max_base_x], width - x); + } + // Fill in corner-only rows. + for (; y < height; ++y) { + memset(dest, top_row[max_base_x], width); + dest += stride; + } +} + +// 7.11.2.4 (7) angle < 90 +inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride, + const uint8_t* const top_row, + const int width, const int height, + const int xstep, const bool upsampled) { + const int upsample_shift = static_cast<int>(upsampled); + if (xstep == 64) { + DirectionalZone1_Step64(dest, stride, top_row, width, height); + return; + } + if (width == 4) { + DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled); + return; + } + if (width >= 32) { + DirectionalZone1_Large(dest, stride, top_row, width, height, xstep, + upsampled); + return; + } + const __m128i sampler = + upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) + : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); + const int scale_bits = 6 - upsample_shift; + const int max_base_x = ((width + height) - 1) << upsample_shift; + + const __m128i max_shift = _mm_set1_epi8(32); + // Downscaling for a weighted average whose weights sum to 32 (max_shift). 
+ const int rounding_bits = 5; + const int base_step = 1 << upsample_shift; + const int base_step8 = base_step << 3; + + // No need to check for exceeding |max_base_x| in the loops. + if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) { + int top_x = xstep; + int y = 0; + do { + int top_base_x = top_x >> scale_bits; + // Permit negative values of |top_x|. + const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi8(shift_val); + const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); + int x = 0; + do { + const __m128i top_vals = LoadUnaligned16(top_row + top_base_x); + __m128i vals = _mm_shuffle_epi8(top_vals, sampler); + vals = _mm_maddubs_epi16(vals, shifts); + vals = RightShiftWithRounding_U16(vals, rounding_bits); + StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); + top_base_x += base_step8; + x += 8; + } while (x < width); + dest += stride; + top_x += xstep; + } while (++y < height); + return; + } + + // Each 16-bit value here corresponds to a position that may exceed + // |max_base_x|. When added to the top_base_x, it is used to mask values + // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is + // not supported for packed integers. 
+ const __m128i offsets = + _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); + + const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x); + const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]); + const __m128i base_step8_vect = _mm_set1_epi16(base_step8); + int top_x = xstep; + int y = 0; + do { + int top_base_x = top_x >> scale_bits; + + if (top_base_x >= max_base_x) { + for (int i = y; i < height; ++i) { + memset(dest, top_row[max_base_x], width); + dest += stride; + } + return; + } + + const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi8(shift_val); + const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); + __m128i top_index_vect = _mm_set1_epi16(top_base_x); + top_index_vect = _mm_add_epi16(top_index_vect, offsets); + + int x = 0; + for (; x < width - 8; + x += 8, top_base_x += base_step8, + top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) { + const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect); + // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents + // reading out of bounds. If all indices are past max and we don't need to + // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will + // reset for the next |y|. 
+ top_base_x &= ~_mm_cvtsi128_si32(past_max); + const __m128i top_vals = LoadUnaligned16(top_row + top_base_x); + __m128i vals = _mm_shuffle_epi8(top_vals, sampler); + vals = _mm_maddubs_epi16(vals, shifts); + vals = RightShiftWithRounding_U16(vals, rounding_bits); + vals = _mm_blendv_epi8(vals, final_top_val, past_max); + StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); + } + const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect); + __m128i vals; + if (upsampled) { + vals = LoadUnaligned16(top_row + top_base_x); + } else { + const __m128i top_vals = LoadLo8(top_row + top_base_x); + vals = _mm_shuffle_epi8(top_vals, sampler); + vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15); + } + vals = _mm_maddubs_epi16(vals, shifts); + vals = RightShiftWithRounding_U16(vals, rounding_bits); + vals = _mm_blendv_epi8(vals, final_top_val, past_max); + StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); + dest += stride; + top_x += xstep; + } while (++y < height); +} + +void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride, + const void* const top_row, + const int width, const int height, + const int xstep, + const bool upsampled_top) { + const auto* const top_ptr = static_cast<const uint8_t*>(top_row); + auto* dst = static_cast<uint8_t*>(dest); + DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep, + upsampled_top); +} + +template <bool upsampled> +inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride, + const uint8_t* const left_column, + const int base_left_y, const int ystep) { + // For use in the non-upsampled case. + const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100); + const int upsample_shift = static_cast<int>(upsampled); + const int scale_bits = 6 - upsample_shift; + const __m128i max_shift = _mm_set1_epi8(32); + // Downscaling for a weighted average whose weights sum to 32 (max_shift). 
+ const int rounding_bits = 5; + + __m128i result_block[4]; + for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) { + const int left_base_y = left_y >> scale_bits; + const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi8(shift_val); + const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); + __m128i vals; + if (upsampled) { + vals = LoadLo8(left_column + left_base_y); + } else { + const __m128i top_vals = LoadLo8(left_column + left_base_y); + vals = _mm_shuffle_epi8(top_vals, sampler); + } + vals = _mm_maddubs_epi16(vals, shifts); + vals = RightShiftWithRounding_U16(vals, rounding_bits); + result_block[x] = _mm_packus_epi16(vals, vals); + } + const __m128i result = Transpose4x4_U8(result_block); + // This is result_row0. + Store4(dest, result); + dest += stride; + const int result_row1 = _mm_extract_epi32(result, 1); + memcpy(dest, &result_row1, sizeof(result_row1)); + dest += stride; + const int result_row2 = _mm_extract_epi32(result, 2); + memcpy(dest, &result_row2, sizeof(result_row2)); + dest += stride; + const int result_row3 = _mm_extract_epi32(result, 3); + memcpy(dest, &result_row3, sizeof(result_row3)); +} + +template <bool upsampled, int height> +inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride, + const uint8_t* const left_column, + const int base_left_y, const int ystep) { + // For use in the non-upsampled case. + const __m128i sampler = + _mm_set_epi64x(0x0807070606050504, 0x0403030202010100); + const int upsample_shift = static_cast<int>(upsampled); + const int scale_bits = 6 - upsample_shift; + const __m128i max_shift = _mm_set1_epi8(32); + // Downscaling for a weighted average whose weights sum to 32 (max_shift). 
+ const int rounding_bits = 5; + + __m128i result_block[8]; + for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) { + const int left_base_y = left_y >> scale_bits; + const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi8(shift_val); + const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); + __m128i vals; + if (upsampled) { + vals = LoadUnaligned16(left_column + left_base_y); + } else { + const __m128i top_vals = LoadUnaligned16(left_column + left_base_y); + vals = _mm_shuffle_epi8(top_vals, sampler); + } + vals = _mm_maddubs_epi16(vals, shifts); + result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits); + } + Transpose8x8_U16(result_block, result_block); + for (int y = 0; y < height; ++y) { + StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y])); + dest += stride; + } +} + +// 7.11.2.4 (9) angle > 180 +void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride, + const void* const left_column, + const int width, const int height, + const int ystep, + const bool upsampled) { + const auto* const left_ptr = static_cast<const uint8_t*>(left_column); + auto* dst = static_cast<uint8_t*>(dest); + const int upsample_shift = static_cast<int>(upsampled); + if (width == 4 || height == 4) { + const ptrdiff_t stride4 = stride << 2; + if (upsampled) { + int left_y = ystep; + int x = 0; + do { + uint8_t* dst_x = dst + x; + int y = 0; + do { + DirectionalZone3_4x4<true>( + dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep); + dst_x += stride4; + y += 4; + } while (y < height); + left_y += ystep << 2; + x += 4; + } while (x < width); + } else { + int left_y = ystep; + int x = 0; + do { + uint8_t* dst_x = dst + x; + int y = 0; + do { + DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y, + ystep); + dst_x += stride4; + y += 4; + } while (y < height); + left_y += ystep << 2; + x += 
4; + } while (x < width); + } + return; + } + + const ptrdiff_t stride8 = stride << 3; + if (upsampled) { + int left_y = ystep; + int x = 0; + do { + uint8_t* dst_x = dst + x; + int y = 0; + do { + DirectionalZone3_8xH<true, 8>( + dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep); + dst_x += stride8; + y += 8; + } while (y < height); + left_y += ystep << 3; + x += 8; + } while (x < width); + } else { + int left_y = ystep; + int x = 0; + do { + uint8_t* dst_x = dst + x; + int y = 0; + do { + DirectionalZone3_8xH<false, 8>( + dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep); + dst_x += stride8; + y += 8; + } while (y < height); + left_y += ystep << 3; + x += 8; + } while (x < width); + } +} + +//------------------------------------------------------------------------------ +// Directional Zone 2 Functions +// 7.11.2.4 (8) + +// DirectionalBlend* selectively overwrites the values written by +// DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each +// row. 
+template <int y_selector> +inline void DirectionalBlend4_SSE4_1(uint8_t* dest, + const __m128i& dest_index_vect, + const __m128i& vals, + const __m128i& zone_bounds) { + const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector); + const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect); + const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest)); + const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left); + Store4(dest, _mm_packus_epi16(blended_vals, blended_vals)); +} + +inline void DirectionalBlend8_SSE4_1(uint8_t* dest, + const __m128i& dest_index_vect, + const __m128i& vals, + const __m128i& zone_bounds, + const __m128i& bounds_selector) { + const __m128i max_dest_x_vect = + _mm_shuffle_epi8(zone_bounds, bounds_selector); + const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect); + const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest)); + const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left); + StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals)); +} + +constexpr int kDirectionalWeightBits = 5; +// |source| is packed with 4 or 8 pairs of 8-bit values from left or top. +// |shifts| is named to match the specification, with 4 or 8 pairs of (32 - +// shift) and shift. Shift is guaranteed to be between 0 and 32. +inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source, + const __m128i& shifts, + const __m128i& sampler) { + const __m128i src_vals = LoadUnaligned16(source); + __m128i vals = _mm_shuffle_epi8(src_vals, sampler); + vals = _mm_maddubs_epi16(vals, shifts); + return RightShiftWithRounding_U16(vals, kDirectionalWeightBits); +} + +// Because the source values "move backwards" as the row index increases, the +// indices derived from ystep are generally negative. 
This is accommodated by +// making sure the relative indices are within [-15, 0] when the function is +// called, and sliding them into the inclusive range [0, 15], relative to a +// lower base address. +constexpr int kPositiveIndexOffset = 15; + +template <bool upsampled> +inline void DirectionalZone2FromLeftCol_4x4_SSE4_1( + uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base, + __m128i left_y) { + const int upsample_shift = static_cast<int>(upsampled); + const int scale_bits = 6 - upsample_shift; + const __m128i max_shifts = _mm_set1_epi8(32); + const __m128i shift_mask = _mm_set1_epi32(0x003F003F); + const __m128i index_increment = _mm_cvtsi32_si128(0x01010101); + const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset); + // Left_column and sampler are both offset by 15 so the indices are always + // positive. + const uint8_t* left_column = left_column_base - kPositiveIndexOffset; + for (int y = 0; y < 4; dst += stride, ++y) { + __m128i offset_y = _mm_srai_epi16(left_y, scale_bits); + offset_y = _mm_packs_epi16(offset_y, offset_y); + + const __m128i adjacent = _mm_add_epi8(offset_y, index_increment); + __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent); + // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they + // can work as shuffle indices. Some values may be out of bounds, but their + // pred results will be masked over by top prediction. 
+ sampler = _mm_add_epi8(sampler, positive_offset); + + __m128i shifts = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1); + shifts = _mm_packus_epi16(shifts, shifts); + const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts); + shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); + const __m128i vals = DirectionalZone2FromSource_SSE4_1( + left_column + (y << upsample_shift), shifts, sampler); + Store4(dst, _mm_packus_epi16(vals, vals)); + } +} + +// The height at which a load of 16 bytes will not contain enough source pixels +// from |left_column| to supply an accurate row when computing 8 pixels at a +// time. The values are found by inspection. By coincidence, all angles that +// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up +// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. +constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = { + 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40}; + +template <bool upsampled> +inline void DirectionalZone2FromLeftCol_8x8_SSE4_1( + uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column, + __m128i left_y) { + const int upsample_shift = static_cast<int>(upsampled); + const int scale_bits = 6 - upsample_shift; + const __m128i max_shifts = _mm_set1_epi8(32); + const __m128i shift_mask = _mm_set1_epi32(0x003F003F); + const __m128i index_increment = _mm_set1_epi8(1); + const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset); + for (int y = 0; y < 8; dst += stride, ++y) { + __m128i offset_y = _mm_srai_epi16(left_y, scale_bits); + offset_y = _mm_packs_epi16(offset_y, offset_y); + const __m128i adjacent = _mm_add_epi8(offset_y, index_increment); + + // Offset the relative index because ystep is negative in Zone 2 and shuffle + // indices must be nonnegative. 
+ __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent); + sampler = _mm_add_epi8(sampler, denegation); + + __m128i shifts = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1); + shifts = _mm_packus_epi16(shifts, shifts); + const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts); + shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); + + // The specification adds (y << 6) to left_y, which is subject to + // upsampling, but this puts sampler indices out of the 0-15 range. It is + // equivalent to offset the source address by (y << upsample_shift) instead. + const __m128i vals = DirectionalZone2FromSource_SSE4_1( + left_column - kPositiveIndexOffset + (y << upsample_shift), shifts, + sampler); + StoreLo8(dst, _mm_packus_epi16(vals, vals)); + } +} + +// |zone_bounds| is an epi16 of the relative x index at which base >= -(1 << +// upsampled_top), for each row. When there are 4 values, they can be duplicated +// with a non-register shuffle mask. +// |shifts| is one pair of weights that applies throughout a given row. 
+template <bool upsampled_top> +inline void DirectionalZone1Blend_4x4( + uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride, + __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts, + const __m128i& dest_index_x, int top_x, const int xstep) { + const int upsample_shift = static_cast<int>(upsampled_top); + const int scale_bits_x = 6 - upsample_shift; + top_x -= xstep; + + int top_base_x = (top_x >> scale_bits_x); + const __m128i vals0 = DirectionalZone2FromSource_SSE4_1( + top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler); + DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds); + top_x -= xstep; + dest += stride; + + top_base_x = (top_x >> scale_bits_x); + const __m128i vals1 = DirectionalZone2FromSource_SSE4_1( + top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler); + DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds); + top_x -= xstep; + dest += stride; + + top_base_x = (top_x >> scale_bits_x); + const __m128i vals2 = DirectionalZone2FromSource_SSE4_1( + top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler); + DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds); + top_x -= xstep; + dest += stride; + + top_base_x = (top_x >> scale_bits_x); + const __m128i vals3 = DirectionalZone2FromSource_SSE4_1( + top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler); + DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds); +} + +template <bool upsampled_top, int height> +inline void DirectionalZone1Blend_8xH( + uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride, + __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts, + const __m128i& dest_index_x, int top_x, const int xstep) { + const int upsample_shift = static_cast<int>(upsampled_top); + const int scale_bits_x = 6 - upsample_shift; + + __m128i y_selector = _mm_set1_epi32(0x01000100); + const __m128i index_increment = _mm_set1_epi32(0x02020202); + for 
(int y = 0; y < height; ++y, + y_selector = _mm_add_epi8(y_selector, index_increment), + dest += stride) { + top_x -= xstep; + const int top_base_x = top_x >> scale_bits_x; + const __m128i vals = DirectionalZone2FromSource_SSE4_1( + top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler); + DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds, y_selector); + } +} + +// 7.11.2.4 (8) 90 < angle > 180 +// The strategy for this function is to know how many blocks can be processed +// with just pixels from |top_ptr|, then handle mixed blocks, then handle only +// blocks that take from |left_ptr|. Additionally, a fast index-shuffle +// approach is used for pred values from |left_column| in sections that permit +// it. +template <bool upsampled_left, bool upsampled_top> +inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride, + const uint8_t* const top_row, + const uint8_t* const left_column, + const int width, const int height, + const int xstep, const int ystep) { + auto* dst = static_cast<uint8_t*>(dest); + const int upsample_left_shift = static_cast<int>(upsampled_left); + const int upsample_top_shift = static_cast<int>(upsampled_top); + const __m128i max_shift = _mm_set1_epi8(32); + const ptrdiff_t stride8 = stride << 3; + const __m128i dest_index_x = + _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000); + const __m128i sampler_top = + upsampled_top + ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) + : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); + const __m128i shift_mask = _mm_set1_epi32(0x003F003F); + // All columns from |min_top_only_x| to the right will only need |top_row| to + // compute. This assumes minimum |xstep| is 3. + const int min_top_only_x = std::min((height * xstep) >> 6, width); + + // For steep angles, the source pixels from left_column may not fit in a + // 16-byte load for shuffling. + // TODO(petersonab): Find a more precise formula for this subject to x. 
+ const int max_shuffle_height = + std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]); + + const int xstep8 = xstep << 3; + const __m128i xstep8_vect = _mm_set1_epi16(xstep8); + // Accumulate xstep across 8 rows. + const __m128i xstep_dup = _mm_set1_epi16(-xstep); + const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); + const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments); + // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 + const __m128i scaled_one = _mm_set1_epi16(-64); + __m128i xstep_bounds_base = + (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift) + : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift); + + const int left_base_increment = ystep >> 6; + const int ystep_remainder = ystep & 0x3F; + const int ystep8 = ystep << 3; + const int left_base_increment8 = ystep8 >> 6; + const int ystep_remainder8 = ystep8 & 0x3F; + const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8); + + // If the 64 scaling is regarded as a decimal point, the first value of the + // left_y vector omits the portion which is covered under the left_column + // offset. Following values need the full ystep as a relative offset. + const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder); + const __m128i ystep_dup = _mm_set1_epi16(-ystep); + __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x); + left_y = _mm_add_epi16(ystep_init, left_y); + + const __m128i increment_top8 = _mm_set1_epi16(8 << 6); + int x = 0; + + // This loop treats each set of 4 columns in 3 stages with y-value boundaries. + // The first stage, before the first y-loop, covers blocks that are only + // computed from the top row. The second stage, comprising two y-loops, covers + // blocks that have a mixture of values computed from top or left. The final + // stage covers blocks that are only computed from the left. 
+ for (int left_offset = -left_base_increment; x < min_top_only_x; + x += 8, + xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8), + // Watch left_y because it can still get big. + left_y = _mm_add_epi16(left_y, increment_left8), + left_offset -= left_base_increment8) { + uint8_t* dst_x = dst + x; + + // Round down to the nearest multiple of 8. + const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; + DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift), + max_top_only_y, -xstep, upsampled_top); + DirectionalZone1_4xH(dst_x + 4, stride, + top_row + ((x + 4) << upsample_top_shift), + max_top_only_y, -xstep, upsampled_top); + + int y = max_top_only_y; + dst_x += stride * y; + const int xstep_y = xstep * y; + const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y); + // All rows from |min_left_only_y| down for this set of columns, only need + // |left_column| to compute. + const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height); + // At high angles such that min_left_only_y < 8, ystep is low and xstep is + // high. This means that max_shuffle_height is unbounded and xstep_bounds + // will overflow in 16 bits. This is prevented by stopping the first + // blending loop at min_left_only_y for such cases, which means we skip over + // the second blending loop as well. 
+ const int left_shuffle_stop_y = + std::min(max_shuffle_height, min_left_only_y); + __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect); + __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect); + int top_x = -xstep_y; + + for (; y < left_shuffle_stop_y; + y += 8, dst_x += stride8, + xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect), + xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect), + top_x -= xstep8) { + DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>( + dst_x, stride, + left_column + ((left_offset + y) << upsample_left_shift), left_y); + + __m128i shifts = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), + shift_mask), + 1); + shifts = _mm_packus_epi16(shifts, shifts); + __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); + shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); + __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); + DirectionalZone1Blend_8xH<upsampled_top, 8>( + dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, + xstep_bounds_off, shifts, dest_index_x, top_x, xstep); + } + // Pick up from the last y-value, using the 10% slower but secure method for + // left prediction. 
+ const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0)); + for (; y < min_left_only_y; + y += 8, dst_x += stride8, + xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect), + xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect), + top_x -= xstep8) { + const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); + + DirectionalZone3_8xH<upsampled_left, 8>( + dst_x, stride, + left_column + ((left_offset + y) << upsample_left_shift), base_left_y, + -ystep); + + __m128i shifts = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), + shift_mask), + 1); + shifts = _mm_packus_epi16(shifts, shifts); + __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); + shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); + DirectionalZone1Blend_8xH<upsampled_top, 8>( + dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, + xstep_bounds_off, shifts, dest_index_x, top_x, xstep); + } + // Loop over y for left_only rows. 
+ for (; y < height; y += 8, dst_x += stride8) { + DirectionalZone3_8xH<upsampled_left, 8>( + dst_x, stride, + left_column + ((left_offset + y) << upsample_left_shift), base_left_y, + -ystep); + } + } + for (; x < width; x += 4) { + DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift), + height, -xstep, upsampled_top); + } +} + +template <bool upsampled_left, bool upsampled_top> +inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride, + const uint8_t* const top_row, + const uint8_t* const left_column, + const int width, const int height, + const int xstep, const int ystep) { + auto* dst = static_cast<uint8_t*>(dest); + const int upsample_left_shift = static_cast<int>(upsampled_left); + const int upsample_top_shift = static_cast<int>(upsampled_top); + const __m128i max_shift = _mm_set1_epi8(32); + const ptrdiff_t stride4 = stride << 2; + const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000); + const __m128i sampler_top = + upsampled_top + ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) + : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); + // All columns from |min_top_only_x| to the right will only need |top_row| to + // compute. + assert(xstep >= 3); + const int min_top_only_x = std::min((height * xstep) >> 6, width); + + const int xstep4 = xstep << 2; + const __m128i xstep4_vect = _mm_set1_epi16(xstep4); + const __m128i xstep_dup = _mm_set1_epi16(-xstep); + const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001); + __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments); + const __m128i scaled_one = _mm_set1_epi16(-64); + // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 + __m128i xstep_bounds_base = + (xstep == 64) ? 
_mm_sub_epi16(scaled_one, xstep_for_shift) + : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift); + + const int left_base_increment = ystep >> 6; + const int ystep_remainder = ystep & 0x3F; + const int ystep4 = ystep << 2; + const int left_base_increment4 = ystep4 >> 6; + // This is guaranteed to be less than 64, but accumulation may bring it past + // 64 for higher x values. + const int ystep_remainder4 = ystep4 & 0x3F; + const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4); + const __m128i increment_top4 = _mm_set1_epi16(4 << 6); + + // If the 64 scaling is regarded as a decimal point, the first value of the + // left_y vector omits the portion which will go into the left_column offset. + // Following values need the full ystep as a relative offset. + const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder); + const __m128i ystep_dup = _mm_set1_epi16(-ystep); + __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x); + left_y = _mm_add_epi16(ystep_init, left_y); + const __m128i shift_mask = _mm_set1_epi32(0x003F003F); + + int x = 0; + // Loop over x for columns with a mixture of sources. + for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4, + xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4), + left_y = _mm_add_epi16(left_y, increment_left4), + left_offset -= left_base_increment4) { + uint8_t* dst_x = dst + x; + + // Round down to the nearest multiple of 8. + const int max_top_only_y = std::min((x << 6) / xstep, height) & 0xFFFFFFF4; + DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift), + max_top_only_y, -xstep, upsampled_top); + int y = max_top_only_y; + dst_x += stride * y; + const int xstep_y = xstep * y; + const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y); + // All rows from |min_left_only_y| down for this set of columns, only need + // |left_column| to compute. Rounded up to the nearest multiple of 4. 
+ const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height); + + __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect); + __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect); + int top_x = -xstep_y; + + // Loop over y for mixed rows. + for (; y < min_left_only_y; + y += 4, dst_x += stride4, + xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect), + xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect), + top_x -= xstep4) { + DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>( + dst_x, stride, + left_column + ((left_offset + y) * (1 << upsample_left_shift)), + left_y); + + __m128i shifts = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), + shift_mask), + 1); + shifts = _mm_packus_epi16(shifts, shifts); + const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); + shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); + const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); + DirectionalZone1Blend_4x4<upsampled_top>( + dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, + xstep_bounds_off, shifts, dest_index_x, top_x, xstep); + } + // Loop over y for left-only rows, if any. + for (; y < height; y += 4, dst_x += stride4) { + DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>( + dst_x, stride, + left_column + ((left_offset + y) << upsample_left_shift), left_y); + } + } + // Loop over top-only columns, if any. 
+ for (; x < width; x += 4) { + DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift), + height, -xstep, upsampled_top); + } +} + +void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const left_column, + const int width, const int height, + const int xstep, const int ystep, + const bool upsampled_top, + const bool upsampled_left) { + // Increasing the negative buffer for this function allows more rows to be + // processed at a time without branching in an inner loop to check the base. + uint8_t top_buffer[288]; + uint8_t left_buffer[288]; + memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160); + memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160); + const uint8_t* top_ptr = top_buffer + 144; + const uint8_t* left_ptr = left_buffer + 144; + if (width == 4 || height == 4) { + if (upsampled_left) { + if (upsampled_top) { + DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr, + width, height, xstep, ystep); + } else { + DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr, + width, height, xstep, ystep); + } + } else { + if (upsampled_top) { + DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr, + width, height, xstep, ystep); + } else { + DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr, + width, height, xstep, ystep); + } + } + return; + } + if (upsampled_left) { + if (upsampled_top) { + DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr, + width, height, xstep, ystep); + } else { + DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr, + width, height, xstep, ystep); + } + } else { + if (upsampled_top) { + DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr, + width, height, xstep, ystep); + } else { + DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr, + width, height, xstep, ystep); + } 
+ } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + static_cast<void>(dsp); +#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1) + dsp->directional_intra_predictor_zone1 = + DirectionalIntraPredictorZone1_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2) + dsp->directional_intra_predictor_zone2 = + DirectionalIntraPredictorZone2_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3) + dsp->directional_intra_predictor_zone3 = + DirectionalIntraPredictorZone3_SSE4_1; +#endif +} + +} // namespace +} // namespace low_bitdepth + +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +//------------------------------------------------------------------------------ +// 7.11.2.4. Directional intra prediction process + +// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning +// upsampling is ruled out. In addition, the bits masked by 0x3F for +// |shift_val| are 0 for all multiples of 64, so the formula +// val = top[top_base_x]*shift + top[top_base_x+1]*(32-shift), reduces to +// val = top[top_base_x+1] << 5, meaning only the second set of pixels is +// involved in the output. Hence |top| is offset by 1. 
+inline void DirectionalZone1_Step64(uint16_t* dst, ptrdiff_t stride, + const uint16_t* const top, const int width, + const int height) { + ptrdiff_t offset = 1; + if (height == 4) { + memcpy(dst, top + offset, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 1, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 2, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 3, width * sizeof(dst[0])); + return; + } + int y = height; + do { + memcpy(dst, top + offset, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 1, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 2, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 3, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 4, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 5, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 6, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 7, width * sizeof(dst[0])); + dst += stride; + + offset += 8; + y -= 8; + } while (y != 0); +} + +// Produce a weighted average whose weights sum to 32. +inline __m128i CombineTopVals4(const __m128i& top_vals, const __m128i& sampler, + const __m128i& shifts, + const __m128i& top_indices, + const __m128i& final_top_val, + const __m128i& border_index) { + const __m128i sampled_values = _mm_shuffle_epi8(top_vals, sampler); + __m128i prod = _mm_mullo_epi16(sampled_values, shifts); + prod = _mm_hadd_epi16(prod, prod); + const __m128i result = RightShiftWithRounding_U16(prod, 5 /*log2(32)*/); + + const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index); + // Replace pixels from invalid range with top-right corner. + return _mm_blendv_epi8(result, final_top_val, past_max); +} + +// When width is 4, only one load operation is needed per iteration. We also +// avoid extra loop precomputations that cause too much overhead. 
+inline void DirectionalZone1_4xH(uint16_t* dst, ptrdiff_t stride, + const uint16_t* const top, const int height, + const int xstep, const bool upsampled, + const __m128i& sampler) { + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift; + const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x); + const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]); + + // Each 16-bit value here corresponds to a position that may exceed + // |max_base_x|. When added to the top_base_x, it is used to mask values + // that pass the end of |top|. Starting from 1 to simulate "cmpge" because + // only cmpgt is available. + const __m128i offsets = + _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); + + // All rows from |min_corner_only_y| down will simply use memcpy. + // |max_base_x| is always greater than |height|, so clipping the denominator + // to 1 is enough to make the logic work. + const int xstep_units = std::max(xstep >> index_scale_bits, 1); + const int min_corner_only_y = std::min(max_base_x / xstep_units, height); + + int y = 0; + int top_x = xstep; + const __m128i max_shift = _mm_set1_epi16(32); + + for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) { + const int top_base_x = top_x >> index_scale_bits; + + // Permit negative values of |top_x|. + const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi16(shift_val); + const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift); + __m128i top_index_vect = _mm_set1_epi16(top_base_x); + top_index_vect = _mm_add_epi16(top_index_vect, offsets); + + // Load 8 values because we will select the sampled values based on + // |upsampled|. 
+ const __m128i values = LoadUnaligned16(top + top_base_x); + const __m128i pred = + CombineTopVals4(values, sampler, shifts, top_index_vect, final_top_val, + max_base_x_vect); + StoreLo8(dst, pred); + } + + // Fill in corner-only rows. + for (; y < height; ++y) { + Memset(dst, top[max_base_x], /* width */ 4); + dst += stride; + } +} + +// General purpose combine function. +// |check_border| means the final source value has to be duplicated into the +// result. This simplifies the loop structures that use precomputed boundaries +// to identify sections where it is safe to compute without checking for the +// right border. +template <bool check_border> +inline __m128i CombineTopVals( + const __m128i& top_vals_0, const __m128i& top_vals_1, + const __m128i& sampler, const __m128i& shifts, + const __m128i& top_indices = _mm_setzero_si128(), + const __m128i& final_top_val = _mm_setzero_si128(), + const __m128i& border_index = _mm_setzero_si128()) { + constexpr int scale_int_bits = 5; + const __m128i sampled_values_0 = _mm_shuffle_epi8(top_vals_0, sampler); + const __m128i sampled_values_1 = _mm_shuffle_epi8(top_vals_1, sampler); + const __m128i prod_0 = _mm_mullo_epi16(sampled_values_0, shifts); + const __m128i prod_1 = _mm_mullo_epi16(sampled_values_1, shifts); + const __m128i combined = _mm_hadd_epi16(prod_0, prod_1); + const __m128i result = RightShiftWithRounding_U16(combined, scale_int_bits); + if (check_border) { + const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index); + // Replace pixels from invalid range with top-right corner. 
+ return _mm_blendv_epi8(result, final_top_val, past_max); + } + return result; +} + +// 7.11.2.4 (7) angle < 90 +inline void DirectionalZone1_Large(uint16_t* dest, ptrdiff_t stride, + const uint16_t* const top_row, + const int width, const int height, + const int xstep, const bool upsampled, + const __m128i& sampler) { + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + const int max_base_x = ((width + height) - 1) << upsample_shift; + + const __m128i max_shift = _mm_set1_epi16(32); + const int base_step = 1 << upsample_shift; + const int base_step8 = base_step << 3; + + // All rows from |min_corner_only_y| down will simply use memcpy. + // |max_base_x| is always greater than |height|, so clipping to 1 is enough + // to make the logic work. + const int xstep_units = std::max(xstep >> index_scale_bits, 1); + const int min_corner_only_y = std::min(max_base_x / xstep_units, height); + + // Rows up to this y-value can be computed without checking for bounds. + const int max_no_corner_y = std::min( + LeftShift((max_base_x - (base_step * width)), index_scale_bits) / xstep, + height); + // No need to check for exceeding |max_base_x| in the first loop. + int y = 0; + int top_x = xstep; + for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) { + int top_base_x = top_x >> index_scale_bits; + // Permit negative values of |top_x|. 
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi16(shift_val); + const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift); + int x = 0; + do { + const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x); + const __m128i top_vals_1 = + LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift)); + + const __m128i pred = + CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts); + + StoreUnaligned16(dest + x, pred); + top_base_x += base_step8; + x += 8; + } while (x < width); + } + + // Each 16-bit value here corresponds to a position that may exceed + // |max_base_x|. When added to |top_base_x|, it is used to mask values + // that pass the end of the |top| buffer. Starting from 1 to simulate "cmpge" + // which is not supported for packed integers. + const __m128i offsets = + _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); + + const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x); + const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]); + const __m128i base_step8_vect = _mm_set1_epi16(base_step8); + for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) { + int top_base_x = top_x >> index_scale_bits; + + const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi16(shift_val); + const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift); + __m128i top_index_vect = _mm_set1_epi16(top_base_x); + top_index_vect = _mm_add_epi16(top_index_vect, offsets); + + int x = 0; + const int min_corner_only_x = + std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7; + for (; x < min_corner_only_x; + x += 8, top_base_x += base_step8, + top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) { + const __m128i top_vals_0 = LoadUnaligned16(top_row + 
top_base_x); + const __m128i top_vals_1 = + LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift)); + const __m128i pred = + CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts, + top_index_vect, final_top_val, max_base_x_vect); + StoreUnaligned16(dest + x, pred); + } + // Corner-only section of the row. + Memset(dest + x, top_row[max_base_x], width - x); + } + // Fill in corner-only rows. + for (; y < height; ++y) { + Memset(dest, top_row[max_base_x], width); + dest += stride; + } +} + +// 7.11.2.4 (7) angle < 90 +inline void DirectionalIntraPredictorZone1_SSE4_1( + void* dest_ptr, ptrdiff_t stride, const void* const top_ptr, + const int width, const int height, const int xstep, const bool upsampled) { + const auto* const top_row = static_cast<const uint16_t*>(top_ptr); + auto* dest = static_cast<uint16_t*>(dest_ptr); + stride /= sizeof(uint16_t); + const int upsample_shift = static_cast<int>(upsampled); + if (xstep == 64) { + DirectionalZone1_Step64(dest, stride, top_row, width, height); + return; + } + // Each base pixel paired with its following pixel, for hadd purposes. + const __m128i adjacency_shuffler = _mm_set_epi16( + 0x0908, 0x0706, 0x0706, 0x0504, 0x0504, 0x0302, 0x0302, 0x0100); + // This is equivalent to not shuffling at all. + const __m128i identity_shuffler = _mm_set_epi16( + 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100); + // This represents a trade-off between code size and speed. When upsampled + // is true, no shuffle is necessary. But to avoid in-loop branching, we + // would need 2 copies of the main function body. + const __m128i sampler = upsampled ? 
identity_shuffler : adjacency_shuffler; + if (width == 4) { + DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled, + sampler); + return; + } + if (width >= 32) { + DirectionalZone1_Large(dest, stride, top_row, width, height, xstep, + upsampled, sampler); + return; + } + const int index_scale_bits = 6 - upsample_shift; + const int max_base_x = ((width + height) - 1) << upsample_shift; + + const __m128i max_shift = _mm_set1_epi16(32); + const int base_step = 1 << upsample_shift; + const int base_step8 = base_step << 3; + + // No need to check for exceeding |max_base_x| in the loops. + if (((xstep * height) >> index_scale_bits) + base_step * width < max_base_x) { + int top_x = xstep; + int y = height; + do { + int top_base_x = top_x >> index_scale_bits; + // Permit negative values of |top_x|. + const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi16(shift_val); + const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift); + int x = 0; + do { + const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x); + const __m128i top_vals_1 = + LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift)); + const __m128i pred = + CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts); + StoreUnaligned16(dest + x, pred); + top_base_x += base_step8; + x += 8; + } while (x < width); + dest += stride; + top_x += xstep; + } while (--y != 0); + return; + } + + // General case. Blocks with width less than 32 do not benefit from x-wise + // loop splitting, but do benefit from using memset on appropriate rows. + + // Each 16-bit value here corresponds to a position that may exceed + // |max_base_x|. When added to the top_base_x, it is used to mask values + // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is + // not supported for packed integers. 
+ const __m128i offsets = + _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); + + const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x); + const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]); + const __m128i base_step8_vect = _mm_set1_epi16(base_step8); + + // All rows from |min_corner_only_y| down will simply use memcpy. + // |max_base_x| is always greater than |height|, so clipping the denominator + // to 1 is enough to make the logic work. + const int xstep_units = std::max(xstep >> index_scale_bits, 1); + const int min_corner_only_y = std::min(max_base_x / xstep_units, height); + + int top_x = xstep; + int y = 0; + for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) { + int top_base_x = top_x >> index_scale_bits; + + const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi16(shift_val); + const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift); + __m128i top_index_vect = _mm_set1_epi16(top_base_x); + top_index_vect = _mm_add_epi16(top_index_vect, offsets); + + for (int x = 0; x < width; x += 8, top_base_x += base_step8, + top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) { + const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x); + const __m128i top_vals_1 = + LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift)); + const __m128i pred = + CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts, + top_index_vect, final_top_val, max_base_x_vect); + StoreUnaligned16(dest + x, pred); + } + } + + // Fill in corner-only rows. 
+ for (; y < height; ++y) { + Memset(dest, top_row[max_base_x], width); + dest += stride; + } +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); + static_cast<void>(dsp); +#if DSP_ENABLED_10BPP_SSE4_1(DirectionalIntraPredictorZone1) + dsp->directional_intra_predictor_zone1 = + DirectionalIntraPredictorZone1_SSE4_1; +#endif +} + +} // namespace +} // namespace high_bitdepth + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraPredDirectionalInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 +namespace libgav1 { +namespace dsp { + +void IntraPredDirectionalInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/intrapred_directional_sse4.h b/src/dsp/x86/intrapred_directional_sse4.h new file mode 100644 index 0000000..b352450 --- /dev/null +++ b/src/dsp/x86/intrapred_directional_sse4.h @@ -0,0 +1,54 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::directional_intra_predictor_zone*, see the defines below for +// specifics. These functions are not thread-safe. +void IntraPredDirectionalInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. +#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 +#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1 +#endif + +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_ diff --git a/src/dsp/x86/intrapred_filter_sse4.cc b/src/dsp/x86/intrapred_filter_sse4.cc new file mode 100644 index 0000000..022af8d --- /dev/null +++ b/src/dsp/x86/intrapred_filter_sse4.cc @@ -0,0 +1,432 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred_filter.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 + +#include <xmmintrin.h> + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/dsp/x86/transpose_sse4.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace { + +//------------------------------------------------------------------------------ +// FilterIntraPredictor_SSE4_1 +// Section 7.11.2.3. Recursive intra prediction process +// This filter applies recursively to 4x2 sub-blocks within the transform block, +// meaning that the predicted pixels in each sub-block are used as inputs to +// sub-blocks below and to the right, if present. +// +// Each output value in the sub-block is predicted by a different filter applied +// to the same array of top-left, top, and left values. If fn refers to the +// output of the nth filter, given this block: +// TL T0 T1 T2 T3 +// L0 f0 f1 f2 f3 +// L1 f4 f5 f6 f7 +// The filter input order is p0, p1, p2, p3, p4, p5, p6: +// p0 p1 p2 p3 p4 +// p5 f0 f1 f2 f3 +// p6 f4 f5 f6 f7 +// Filters usually apply to 8 values for convenience, so in this case we fix +// the 8th filter tap to 0 and disregard the value of the 8th input. + +// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which +// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes. 
+constexpr int kDuplicateFirstHalf = 0x44; + +// Apply all filter taps to the given 7 packed 16-bit values, keeping the 8th +// at zero to preserve the sum. +// |pixels| contains p0-p7 in order as shown above. +// |taps_0_1| contains the filter kernels used to predict f0 and f1, and so on. +inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride, + const __m128i& pixels, const __m128i& taps_0_1, + const __m128i& taps_2_3, const __m128i& taps_4_5, + const __m128i& taps_6_7) { + const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1); + const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3); + // |output_half| contains 8 partial sums for f0-f7. + __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23); + __m128i output = _mm_hadd_epi16(output_half, output_half); + const __m128i output_row0 = + _mm_packus_epi16(RightShiftWithRounding_S16(output, 4), + /* unused half */ output); + Store4(dst, output_row0); + const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5); + const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7); + output_half = _mm_hadd_epi16(mul_1_01, mul_1_23); + output = _mm_hadd_epi16(output_half, output_half); + const __m128i output_row1 = + _mm_packus_epi16(RightShiftWithRounding_S16(output, 4), + /* arbitrary pack arg */ output); + Store4(dst + stride, output_row1); +} + +// 4xH transform sizes are given special treatment because LoadLo8 goes out +// of bounds and every block involves the left column. The top-left pixel, p0, +// is stored in the top buffer for the first 4x2, but comes from the left buffer +// for successive blocks. This implementation takes advantage of the fact +// that the p5 and p6 for each sub-block come solely from the |left_ptr| buffer, +// using shifts to arrange things to fit reusable shuffle vectors. 
+inline void Filter4xH(uint8_t* dest, ptrdiff_t stride, + const uint8_t* const top_ptr, + const uint8_t* const left_ptr, FilterIntraPredictor pred, + const int height) { + // Two filter kernels per vector. + const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]); + const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]); + const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]); + const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]); + __m128i top = Load4(top_ptr - 1); + __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4); + __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr)); + left = _mm_slli_si128(left, 5); + + // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1], + // left[2], left[3], left[4], left[5], left[6], left[7] + // Let rn represent a pixel usable as pn for the 4x2 after this one. We get: + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // p0 p1 p2 p3 p4 p5 p6 r5 r6 ... + // r0 + pixels = _mm_or_si128(left, pixels); + + // Two sets of the same input pixels to apply two filters at once. + pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); + Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + dest += stride; // Move to y = 1. + pixels = Load4(dest); + + // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1], + // left[0], left[1], ... + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ... + // r0 + pixels = _mm_or_si128(left, pixels); + + // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last + // byte is an unused value, which shall be multiplied by 0 when we apply the + // filter. + constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006; + + // Insert left[-1] in front as TL and put left[0] and left[1] at the end. 
+ const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 2. + Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + dest += stride; // Move to y = 3. + + // Compute the middle 8 rows before using common code for the final 4 rows, in + // order to fit the assumption that |left| has the next TL at position 8. + if (height == 16) { + // This shift allows us to use pixel_order2 twice after shifting by 2 later. + left = _mm_slli_si128(left, 1); + pixels = Load4(dest); + + // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4], + // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3] + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ... + // r0 + pixels = _mm_or_si128(left, pixels); + + // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 7, 8, 15. The + // last byte is an unused value, as above. The top-left was shifted to + // position nine to keep two empty spaces after the top pixels. + constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009; + + // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at + // the end. + const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask); + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + dest += stride; // Move to y = 4. + + // First 4x2 in the if body. + Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + + // Clear all but final pixel in the first 8 of left column. + __m128i keep_top_left = _mm_srli_si128(left, 13); + dest += stride; // Move to y = 5. + pixels = Load4(dest); + left = _mm_srli_si128(left, 2); + + // Relative pixels: top[0], top[1], top[2], top[3], left[-6], + // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1] + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ... 
+ // r0 + pixels = _mm_or_si128(left, pixels); + left = LoadLo8(left_ptr + 8); + + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + dest += stride; // Move to y = 6. + + // Second 4x2 in the if body. + Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + + // Position TL value so we can use pixel_order1. + keep_top_left = _mm_slli_si128(keep_top_left, 6); + dest += stride; // Move to y = 7. + pixels = Load4(dest); + left = _mm_slli_si128(left, 7); + left = _mm_or_si128(left, keep_top_left); + + // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, + // left[-1], left[0], left[1], left[2], left[3], ... + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ... + // r0 + pixels = _mm_or_si128(left, pixels); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 8. + + // Third 4x2 in the if body. + Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + dest += stride; // Move to y = 9. + + // Prepare final inputs. + pixels = Load4(dest); + left = _mm_srli_si128(left, 2); + + // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2] + // left[-1], left[0], left[1], left[2], left[3], ... + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ... + // r0 + pixels = _mm_or_si128(left, pixels); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 10. + + // Fourth 4x2 in the if body. + Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + dest += stride; // Move to y = 11. + } + + // In both the 8 and 16 case at this point, we can assume that |left| has the + // next TL at position 8. + if (height > 4) { + // Erase prior left pixels by shifting TL to position 0. 
+ left = _mm_srli_si128(left, 8); + left = _mm_slli_si128(left, 6); + pixels = Load4(dest); + + // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, + // left[-1], left[0], left[1], left[2], left[3], ... + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ... + // r0 + pixels = _mm_or_si128(left, pixels); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 12 or 4. + + // First of final two 4x2 blocks. + Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + dest += stride; // Move to y = 13 or 5. + pixels = Load4(dest); + left = _mm_srli_si128(left, 2); + + // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2] + // left[-1], left[0], left[1], left[2], left[3], ... + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ... + // r0 + pixels = _mm_or_si128(left, pixels); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 14 or 6. + + // Last of final two 4x2 blocks. + Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + } +} + +void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const left_column, + FilterIntraPredictor pred, const int width, + const int height) { + const auto* const top_ptr = static_cast<const uint8_t*>(top_row); + const auto* const left_ptr = static_cast<const uint8_t*>(left_column); + auto* dst = static_cast<uint8_t*>(dest); + if (width == 4) { + Filter4xH(dst, stride, top_ptr, left_ptr, pred, height); + return; + } + + // There is one set of 7 taps for each of the 4x2 output pixels. 
+ const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]); + const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]); + const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]); + const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]); + + // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at + // the end is an unused value, which shall be multiplied by 0 when we apply + // the filter. + constexpr int64_t kCondenseLeftMask = 0x0F09080403020100; + + // Takes the "left section" and puts it right after p0-p4. + const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask); + + // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last + // byte is unused as above. + constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008; + + // Shuffles the "top left" from the left section, to the front. Used when + // grabbing data from left_column and not top_row. + const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask); + + // This first pass takes care of the cases where the top left pixel comes from + // top_row. + __m128i pixels = LoadLo8(top_ptr - 1); + __m128i left = _mm_slli_si128(Load4(left_column), 8); + pixels = _mm_or_si128(pixels, left); + + // Two sets of the same pixels to multiply with two sets of taps. + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7); + left = _mm_srli_si128(left, 1); + + // Load + pixels = Load4(dst + stride); + + // Because of the above shift, this OR 'invades' the final of the first 8 + // bytes of |pixels|. This is acceptable because the 8th filter tap is always + // a padded 0. 
+ pixels = _mm_or_si128(pixels, left); + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + const ptrdiff_t stride2 = stride << 1; + const ptrdiff_t stride4 = stride << 2; + Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + dst += 4; + for (int x = 3; x < width - 4; x += 4) { + pixels = Load4(top_ptr + x); + pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4); + pixels = _mm_insert_epi8(pixels, dst[-1], 5); + pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6); + + // Duplicate bottom half into upper half. + pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); + Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + pixels = Load4(dst + stride - 1); + pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4); + pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5); + pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6); + + // Duplicate bottom half into upper half. + pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); + Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, + taps_4_5, taps_6_7); + dst += 4; + } + + // Now we handle heights that reference previous blocks rather than top_row. + for (int y = 4; y < height; y += 4) { + // Leftmost 4x4 block for this height. + dst -= width; + dst += stride4; + + // Top Left is not available by offset in these leftmost blocks. + pixels = Load4(dst - stride); + left = _mm_slli_si128(Load4(left_ptr + y - 1), 8); + left = _mm_insert_epi8(left, left_ptr[y + 3], 12); + pixels = _mm_or_si128(pixels, left); + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + + // The bytes shifted into positions 6 and 7 will be ignored by the shuffle. 
+ left = _mm_srli_si128(left, 2); + pixels = Load4(dst + stride); + pixels = _mm_or_si128(pixels, left); + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, + taps_4_5, taps_6_7); + + dst += 4; + + // Remaining 4x4 blocks for this height. + for (int x = 4; x < width; x += 4) { + pixels = Load4(dst - stride - 1); + pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4); + pixels = _mm_insert_epi8(pixels, dst[-1], 5); + pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6); + + // Duplicate bottom half into upper half. + pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); + Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + pixels = Load4(dst + stride - 1); + pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4); + pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5); + pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6); + + // Duplicate bottom half into upper half. + pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); + Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, + taps_4_5, taps_6_7); + dst += 4; + } + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + static_cast<void>(dsp); +// These guards check if this version of the function was not superseded by +// a higher optimization level, such as AVX. The corresponding #define also +// prevents the C version from being added to the table. 
+#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor) + dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1; +#endif +} + +} // namespace + +void IntraPredFilterInit_SSE4_1() { Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 +namespace libgav1 { +namespace dsp { + +void IntraPredFilterInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/intrapred_filter_sse4.h b/src/dsp/x86/intrapred_filter_sse4.h new file mode 100644 index 0000000..ce28f93 --- /dev/null +++ b/src/dsp/x86/intrapred_filter_sse4.h @@ -0,0 +1,41 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::filter_intra_predictor, see the defines below for specifics. +// These functions are not thread-safe. +void IntraPredFilterInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. 
+#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor +#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_ diff --git a/src/dsp/x86/intrapred_smooth_sse4.cc b/src/dsp/x86/intrapred_smooth_sse4.cc index e944ea3..de9f551 100644 --- a/src/dsp/x86/intrapred_smooth_sse4.cc +++ b/src/dsp/x86/intrapred_smooth_sse4.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "src/dsp/intrapred.h" +#include "src/dsp/intrapred_smooth.h" #include "src/utils/cpu.h" #if LIBGAV1_TARGETING_SSE4_1 @@ -22,12 +22,12 @@ #include <cassert> #include <cstddef> #include <cstdint> -#include <cstring> // memcpy #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/dsp/x86/common_sse4.h" #include "src/utils/common.h" +#include "src/utils/constants.h" namespace libgav1 { namespace dsp { @@ -67,29 +67,6 @@ inline void WriteSmoothHorizontalSum4(void* const dest, const __m128i& left, Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8)); } -template <int y_mask> -inline __m128i SmoothVerticalSum4(const __m128i& top, const __m128i& weights, - const __m128i& scaled_bottom_left) { - const __m128i weights_y = _mm_shuffle_epi32(weights, y_mask); - const __m128i weighted_top_y = _mm_mullo_epi16(top, weights_y); - const __m128i scaled_bottom_left_y = - _mm_shuffle_epi32(scaled_bottom_left, y_mask); - return _mm_add_epi32(scaled_bottom_left_y, weighted_top_y); -} - -template <int y_mask> -inline void WriteSmoothVerticalSum4(uint8_t* dest, const __m128i& top, - const __m128i& weights, - const __m128i& scaled_bottom_left, - const __m128i& round) { - __m128i pred_sum = - SmoothVerticalSum4<y_mask>(top, weights, scaled_bottom_left); - // Equivalent to RightShiftWithRounding(pred[x][y], 8). 
- pred_sum = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 8); - const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400); - Store4(dest, _mm_shuffle_epi8(pred_sum, cvtepi32_epi8)); -} - // For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V, // |pixels| is a segment of the top row or the whole top row, and |weights| is // repeated. diff --git a/src/dsp/x86/intrapred_smooth_sse4.h b/src/dsp/x86/intrapred_smooth_sse4.h new file mode 100644 index 0000000..9353371 --- /dev/null +++ b/src/dsp/x86/intrapred_smooth_sse4.h @@ -0,0 +1,318 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*]. +// This function is not thread-safe. +void IntraPredSmoothInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. 
+#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 
+#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif 
+#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_ diff --git a/src/dsp/x86/intrapred_sse4.cc b/src/dsp/x86/intrapred_sse4.cc index 9938dfe..063929d 100644 --- a/src/dsp/x86/intrapred_sse4.cc +++ b/src/dsp/x86/intrapred_sse4.cc @@ -23,13 +23,14 @@ #include <cassert> #include <cstddef> #include <cstdint> -#include <cstring> // memcpy +#include <cstring> #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/dsp/x86/common_sse4.h" #include "src/dsp/x86/transpose_sse4.h" #include "src/utils/common.h" +#include "src/utils/constants.h" namespace libgav1 { namespace dsp { @@ -51,10 +52,6 @@ inline __m128i DivideByMultiplyShift_U32(const __m128i dividend) { return _mm_mulhi_epi16(interm, _mm_cvtsi32_si128(multiplier)); } -// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which -// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes. -constexpr int kDuplicateFirstHalf = 0x44; - //------------------------------------------------------------------------------ // DcPredFuncs_SSE4_1 @@ -1408,1337 +1405,6 @@ void Paeth64x64_SSE4_1(void* const dest, ptrdiff_t stride, WritePaeth16x16(dst + 48, stride, top_left, top_3, left_3); } -//------------------------------------------------------------------------------ -// 7.11.2.4. Directional intra prediction process - -// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning -// upsampling is ruled out. In addition, the bits masked by 0x3F for -// |shift_val| are 0 for all multiples of 64, so the formula -// val = top[top_base_x]*shift + top[top_base_x+1]*(32-shift), reduces to -// val = top[top_base_x+1] << 5, meaning only the second set of pixels is -// involved in the output. Hence |top| is offset by 1. 
-inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride, - const uint8_t* const top, const int width, - const int height) { - ptrdiff_t offset = 1; - if (height == 4) { - memcpy(dst, top + offset, width); - dst += stride; - memcpy(dst, top + offset + 1, width); - dst += stride; - memcpy(dst, top + offset + 2, width); - dst += stride; - memcpy(dst, top + offset + 3, width); - return; - } - int y = 0; - do { - memcpy(dst, top + offset, width); - dst += stride; - memcpy(dst, top + offset + 1, width); - dst += stride; - memcpy(dst, top + offset + 2, width); - dst += stride; - memcpy(dst, top + offset + 3, width); - dst += stride; - memcpy(dst, top + offset + 4, width); - dst += stride; - memcpy(dst, top + offset + 5, width); - dst += stride; - memcpy(dst, top + offset + 6, width); - dst += stride; - memcpy(dst, top + offset + 7, width); - dst += stride; - - offset += 8; - y += 8; - } while (y < height); -} - -inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride, - const uint8_t* const top, const int height, - const int xstep, const bool upsampled) { - const int upsample_shift = static_cast<int>(upsampled); - const int scale_bits = 6 - upsample_shift; - const int rounding_bits = 5; - const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift; - const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]); - const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100) - : _mm_set_epi64x(0, 0x0403030202010100); - // Each 16-bit value here corresponds to a position that may exceed - // |max_base_x|. When added to the top_base_x, it is used to mask values - // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is - // not supported for packed integers. - const __m128i offsets = - _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); - - // All rows from |min_corner_only_y| down will simply use memcpy. 
|max_base_x| - // is always greater than |height|, so clipping to 1 is enough to make the - // logic work. - const int xstep_units = std::max(xstep >> scale_bits, 1); - const int min_corner_only_y = std::min(max_base_x / xstep_units, height); - - // Rows up to this y-value can be computed without checking for bounds. - int y = 0; - int top_x = xstep; - - for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) { - const int top_base_x = top_x >> scale_bits; - - // Permit negative values of |top_x|. - const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; - const __m128i shift = _mm_set1_epi8(shift_val); - const __m128i max_shift = _mm_set1_epi8(32); - const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); - const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); - __m128i top_index_vect = _mm_set1_epi16(top_base_x); - top_index_vect = _mm_add_epi16(top_index_vect, offsets); - const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x); - - // Load 8 values because we will select the sampled values based on - // |upsampled|. - const __m128i values = LoadLo8(top + top_base_x); - const __m128i sampled_values = _mm_shuffle_epi8(values, sampler); - const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect); - __m128i prod = _mm_maddubs_epi16(sampled_values, shifts); - prod = RightShiftWithRounding_U16(prod, rounding_bits); - // Replace pixels from invalid range with top-right corner. - prod = _mm_blendv_epi8(prod, final_top_val, past_max); - Store4(dst, _mm_packus_epi16(prod, prod)); - } - - // Fill in corner-only rows. 
- for (; y < height; ++y) { - memset(dst, top[max_base_x], /* width */ 4); - dst += stride; - } -} - -// 7.11.2.4 (7) angle < 90 -inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride, - const uint8_t* const top_row, - const int width, const int height, - const int xstep, const bool upsampled) { - const int upsample_shift = static_cast<int>(upsampled); - const __m128i sampler = - upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) - : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); - const int scale_bits = 6 - upsample_shift; - const int max_base_x = ((width + height) - 1) << upsample_shift; - - const __m128i max_shift = _mm_set1_epi8(32); - const int rounding_bits = 5; - const int base_step = 1 << upsample_shift; - const int base_step8 = base_step << 3; - - // All rows from |min_corner_only_y| down will simply use memcpy. |max_base_x| - // is always greater than |height|, so clipping to 1 is enough to make the - // logic work. - const int xstep_units = std::max(xstep >> scale_bits, 1); - const int min_corner_only_y = std::min(max_base_x / xstep_units, height); - - // Rows up to this y-value can be computed without checking for bounds. - const int max_no_corner_y = std::min( - LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep, - height); - // No need to check for exceeding |max_base_x| in the first loop. - int y = 0; - int top_x = xstep; - for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) { - int top_base_x = top_x >> scale_bits; - // Permit negative values of |top_x|. 
- const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; - const __m128i shift = _mm_set1_epi8(shift_val); - const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); - const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); - int x = 0; - do { - const __m128i top_vals = LoadUnaligned16(top_row + top_base_x); - __m128i vals = _mm_shuffle_epi8(top_vals, sampler); - vals = _mm_maddubs_epi16(vals, shifts); - vals = RightShiftWithRounding_U16(vals, rounding_bits); - StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); - top_base_x += base_step8; - x += 8; - } while (x < width); - } - - // Each 16-bit value here corresponds to a position that may exceed - // |max_base_x|. When added to the top_base_x, it is used to mask values - // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is - // not supported for packed integers. - const __m128i offsets = - _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); - - const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x); - const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]); - const __m128i base_step8_vect = _mm_set1_epi16(base_step8); - for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) { - int top_base_x = top_x >> scale_bits; - - const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; - const __m128i shift = _mm_set1_epi8(shift_val); - const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); - const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); - __m128i top_index_vect = _mm_set1_epi16(top_base_x); - top_index_vect = _mm_add_epi16(top_index_vect, offsets); - - int x = 0; - const int min_corner_only_x = - std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7; - for (; x < min_corner_only_x; - x += 8, top_base_x += base_step8, - top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) { - const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect); - // Assuming a 
buffer zone of 8 bytes at the end of top_row, this prevents - // reading out of bounds. If all indices are past max and we don't need to - // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will - // reset for the next |y|. - top_base_x &= ~_mm_cvtsi128_si32(past_max); - const __m128i top_vals = LoadUnaligned16(top_row + top_base_x); - __m128i vals = _mm_shuffle_epi8(top_vals, sampler); - vals = _mm_maddubs_epi16(vals, shifts); - vals = RightShiftWithRounding_U16(vals, rounding_bits); - vals = _mm_blendv_epi8(vals, final_top_val, past_max); - StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); - } - // Corner-only section of the row. - memset(dest + x, top_row[max_base_x], width - x); - } - // Fill in corner-only rows. - for (; y < height; ++y) { - memset(dest, top_row[max_base_x], width); - dest += stride; - } -} - -// 7.11.2.4 (7) angle < 90 -inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride, - const uint8_t* const top_row, - const int width, const int height, - const int xstep, const bool upsampled) { - const int upsample_shift = static_cast<int>(upsampled); - if (xstep == 64) { - DirectionalZone1_Step64(dest, stride, top_row, width, height); - return; - } - if (width == 4) { - DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled); - return; - } - if (width >= 32) { - DirectionalZone1_Large(dest, stride, top_row, width, height, xstep, - upsampled); - return; - } - const __m128i sampler = - upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) - : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); - const int scale_bits = 6 - upsample_shift; - const int max_base_x = ((width + height) - 1) << upsample_shift; - - const __m128i max_shift = _mm_set1_epi8(32); - const int rounding_bits = 5; - const int base_step = 1 << upsample_shift; - const int base_step8 = base_step << 3; - - // No need to check for exceeding |max_base_x| in the loops. 
- if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) { - int top_x = xstep; - int y = 0; - do { - int top_base_x = top_x >> scale_bits; - // Permit negative values of |top_x|. - const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; - const __m128i shift = _mm_set1_epi8(shift_val); - const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); - const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); - int x = 0; - do { - const __m128i top_vals = LoadUnaligned16(top_row + top_base_x); - __m128i vals = _mm_shuffle_epi8(top_vals, sampler); - vals = _mm_maddubs_epi16(vals, shifts); - vals = RightShiftWithRounding_U16(vals, rounding_bits); - StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); - top_base_x += base_step8; - x += 8; - } while (x < width); - dest += stride; - top_x += xstep; - } while (++y < height); - return; - } - - // Each 16-bit value here corresponds to a position that may exceed - // |max_base_x|. When added to the top_base_x, it is used to mask values - // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is - // not supported for packed integers. 
- const __m128i offsets = - _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); - - const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x); - const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]); - const __m128i base_step8_vect = _mm_set1_epi16(base_step8); - int top_x = xstep; - int y = 0; - do { - int top_base_x = top_x >> scale_bits; - - if (top_base_x >= max_base_x) { - for (int i = y; i < height; ++i) { - memset(dest, top_row[max_base_x], width); - dest += stride; - } - return; - } - - const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; - const __m128i shift = _mm_set1_epi8(shift_val); - const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); - const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); - __m128i top_index_vect = _mm_set1_epi16(top_base_x); - top_index_vect = _mm_add_epi16(top_index_vect, offsets); - - int x = 0; - for (; x < width - 8; - x += 8, top_base_x += base_step8, - top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) { - const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect); - // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents - // reading out of bounds. If all indices are past max and we don't need to - // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will - // reset for the next |y|. 
- top_base_x &= ~_mm_cvtsi128_si32(past_max); - const __m128i top_vals = LoadUnaligned16(top_row + top_base_x); - __m128i vals = _mm_shuffle_epi8(top_vals, sampler); - vals = _mm_maddubs_epi16(vals, shifts); - vals = RightShiftWithRounding_U16(vals, rounding_bits); - vals = _mm_blendv_epi8(vals, final_top_val, past_max); - StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); - } - const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect); - __m128i vals; - if (upsampled) { - vals = LoadUnaligned16(top_row + top_base_x); - } else { - const __m128i top_vals = LoadLo8(top_row + top_base_x); - vals = _mm_shuffle_epi8(top_vals, sampler); - vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15); - } - vals = _mm_maddubs_epi16(vals, shifts); - vals = RightShiftWithRounding_U16(vals, rounding_bits); - vals = _mm_blendv_epi8(vals, final_top_val, past_max); - StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); - dest += stride; - top_x += xstep; - } while (++y < height); -} - -void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const int width, const int height, - const int xstep, - const bool upsampled_top) { - const auto* const top_ptr = static_cast<const uint8_t*>(top_row); - auto* dst = static_cast<uint8_t*>(dest); - DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep, - upsampled_top); -} - -template <bool upsampled> -inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride, - const uint8_t* const left_column, - const int base_left_y, const int ystep) { - // For use in the non-upsampled case. 
- const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100); - const int upsample_shift = static_cast<int>(upsampled); - const int scale_bits = 6 - upsample_shift; - const __m128i max_shift = _mm_set1_epi8(32); - const int rounding_bits = 5; - - __m128i result_block[4]; - for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) { - const int left_base_y = left_y >> scale_bits; - const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1; - const __m128i shift = _mm_set1_epi8(shift_val); - const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); - const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); - __m128i vals; - if (upsampled) { - vals = LoadLo8(left_column + left_base_y); - } else { - const __m128i top_vals = LoadLo8(left_column + left_base_y); - vals = _mm_shuffle_epi8(top_vals, sampler); - } - vals = _mm_maddubs_epi16(vals, shifts); - vals = RightShiftWithRounding_U16(vals, rounding_bits); - result_block[x] = _mm_packus_epi16(vals, vals); - } - const __m128i result = Transpose4x4_U8(result_block); - // This is result_row0. - Store4(dest, result); - dest += stride; - const int result_row1 = _mm_extract_epi32(result, 1); - memcpy(dest, &result_row1, sizeof(result_row1)); - dest += stride; - const int result_row2 = _mm_extract_epi32(result, 2); - memcpy(dest, &result_row2, sizeof(result_row2)); - dest += stride; - const int result_row3 = _mm_extract_epi32(result, 3); - memcpy(dest, &result_row3, sizeof(result_row3)); -} - -template <bool upsampled, int height> -inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride, - const uint8_t* const left_column, - const int base_left_y, const int ystep) { - // For use in the non-upsampled case. 
- const __m128i sampler = - _mm_set_epi64x(0x0807070606050504, 0x0403030202010100); - const int upsample_shift = static_cast<int>(upsampled); - const int scale_bits = 6 - upsample_shift; - const __m128i max_shift = _mm_set1_epi8(32); - const int rounding_bits = 5; - - __m128i result_block[8]; - for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) { - const int left_base_y = left_y >> scale_bits; - const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; - const __m128i shift = _mm_set1_epi8(shift_val); - const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); - const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); - __m128i vals; - if (upsampled) { - vals = LoadUnaligned16(left_column + left_base_y); - } else { - const __m128i top_vals = LoadUnaligned16(left_column + left_base_y); - vals = _mm_shuffle_epi8(top_vals, sampler); - } - vals = _mm_maddubs_epi16(vals, shifts); - result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits); - } - Transpose8x8_U16(result_block, result_block); - for (int y = 0; y < height; ++y) { - StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y])); - dest += stride; - } -} - -// 7.11.2.4 (9) angle > 180 -void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride, - const void* const left_column, - const int width, const int height, - const int ystep, - const bool upsampled) { - const auto* const left_ptr = static_cast<const uint8_t*>(left_column); - auto* dst = static_cast<uint8_t*>(dest); - const int upsample_shift = static_cast<int>(upsampled); - if (width == 4 || height == 4) { - const ptrdiff_t stride4 = stride << 2; - if (upsampled) { - int left_y = ystep; - int x = 0; - do { - uint8_t* dst_x = dst + x; - int y = 0; - do { - DirectionalZone3_4x4<true>( - dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep); - dst_x += stride4; - y += 4; - } while (y < height); - left_y += ystep << 2; - x += 4; - } while (x < width); - } else { - int left_y = 
ystep; - int x = 0; - do { - uint8_t* dst_x = dst + x; - int y = 0; - do { - DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y, - ystep); - dst_x += stride4; - y += 4; - } while (y < height); - left_y += ystep << 2; - x += 4; - } while (x < width); - } - return; - } - - const ptrdiff_t stride8 = stride << 3; - if (upsampled) { - int left_y = ystep; - int x = 0; - do { - uint8_t* dst_x = dst + x; - int y = 0; - do { - DirectionalZone3_8xH<true, 8>( - dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep); - dst_x += stride8; - y += 8; - } while (y < height); - left_y += ystep << 3; - x += 8; - } while (x < width); - } else { - int left_y = ystep; - int x = 0; - do { - uint8_t* dst_x = dst + x; - int y = 0; - do { - DirectionalZone3_8xH<false, 8>( - dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep); - dst_x += stride8; - y += 8; - } while (y < height); - left_y += ystep << 3; - x += 8; - } while (x < width); - } -} - -//------------------------------------------------------------------------------ -// Directional Zone 2 Functions -// 7.11.2.4 (8) - -// DirectionalBlend* selectively overwrites the values written by -// DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each -// row. 
-template <int y_selector> -inline void DirectionalBlend4_SSE4_1(uint8_t* dest, - const __m128i& dest_index_vect, - const __m128i& vals, - const __m128i& zone_bounds) { - const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector); - const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect); - const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest)); - const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left); - Store4(dest, _mm_packus_epi16(blended_vals, blended_vals)); -} - -inline void DirectionalBlend8_SSE4_1(uint8_t* dest, - const __m128i& dest_index_vect, - const __m128i& vals, - const __m128i& zone_bounds, - const __m128i& bounds_selector) { - const __m128i max_dest_x_vect = - _mm_shuffle_epi8(zone_bounds, bounds_selector); - const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect); - const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest)); - const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left); - StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals)); -} - -constexpr int kDirectionalWeightBits = 5; -// |source| is packed with 4 or 8 pairs of 8-bit values from left or top. -// |shifts| is named to match the specification, with 4 or 8 pairs of (32 - -// shift) and shift. Shift is guaranteed to be between 0 and 32. -inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source, - const __m128i& shifts, - const __m128i& sampler) { - const __m128i src_vals = LoadUnaligned16(source); - __m128i vals = _mm_shuffle_epi8(src_vals, sampler); - vals = _mm_maddubs_epi16(vals, shifts); - return RightShiftWithRounding_U16(vals, kDirectionalWeightBits); -} - -// Because the source values "move backwards" as the row index increases, the -// indices derived from ystep are generally negative. 
This is accommodated by -// making sure the relative indices are within [-15, 0] when the function is -// called, and sliding them into the inclusive range [0, 15], relative to a -// lower base address. -constexpr int kPositiveIndexOffset = 15; - -template <bool upsampled> -inline void DirectionalZone2FromLeftCol_4x4_SSE4_1( - uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base, - __m128i left_y) { - const int upsample_shift = static_cast<int>(upsampled); - const int scale_bits = 6 - upsample_shift; - const __m128i max_shifts = _mm_set1_epi8(32); - const __m128i shift_mask = _mm_set1_epi32(0x003F003F); - const __m128i index_increment = _mm_cvtsi32_si128(0x01010101); - const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset); - // Left_column and sampler are both offset by 15 so the indices are always - // positive. - const uint8_t* left_column = left_column_base - kPositiveIndexOffset; - for (int y = 0; y < 4; dst += stride, ++y) { - __m128i offset_y = _mm_srai_epi16(left_y, scale_bits); - offset_y = _mm_packs_epi16(offset_y, offset_y); - - const __m128i adjacent = _mm_add_epi8(offset_y, index_increment); - __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent); - // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they - // can work as shuffle indices. Some values may be out of bounds, but their - // pred results will be masked over by top prediction. 
- sampler = _mm_add_epi8(sampler, positive_offset); - - __m128i shifts = _mm_srli_epi16( - _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1); - shifts = _mm_packus_epi16(shifts, shifts); - const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts); - shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); - const __m128i vals = DirectionalZone2FromSource_SSE4_1( - left_column + (y << upsample_shift), shifts, sampler); - Store4(dst, _mm_packus_epi16(vals, vals)); - } -} - -// The height at which a load of 16 bytes will not contain enough source pixels -// from |left_column| to supply an accurate row when computing 8 pixels at a -// time. The values are found by inspection. By coincidence, all angles that -// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up -// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. -constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = { - 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40}; - -template <bool upsampled> -inline void DirectionalZone2FromLeftCol_8x8_SSE4_1( - uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column, - __m128i left_y) { - const int upsample_shift = static_cast<int>(upsampled); - const int scale_bits = 6 - upsample_shift; - const __m128i max_shifts = _mm_set1_epi8(32); - const __m128i shift_mask = _mm_set1_epi32(0x003F003F); - const __m128i index_increment = _mm_set1_epi8(1); - const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset); - for (int y = 0; y < 8; dst += stride, ++y) { - __m128i offset_y = _mm_srai_epi16(left_y, scale_bits); - offset_y = _mm_packs_epi16(offset_y, offset_y); - const __m128i adjacent = _mm_add_epi8(offset_y, index_increment); - - // Offset the relative index because ystep is negative in Zone 2 and shuffle - // indices must be nonnegative. 
- __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent); - sampler = _mm_add_epi8(sampler, denegation); - - __m128i shifts = _mm_srli_epi16( - _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1); - shifts = _mm_packus_epi16(shifts, shifts); - const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts); - shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); - - // The specification adds (y << 6) to left_y, which is subject to - // upsampling, but this puts sampler indices out of the 0-15 range. It is - // equivalent to offset the source address by (y << upsample_shift) instead. - const __m128i vals = DirectionalZone2FromSource_SSE4_1( - left_column - kPositiveIndexOffset + (y << upsample_shift), shifts, - sampler); - StoreLo8(dst, _mm_packus_epi16(vals, vals)); - } -} - -// |zone_bounds| is an epi16 of the relative x index at which base >= -(1 << -// upsampled_top), for each row. When there are 4 values, they can be duplicated -// with a non-register shuffle mask. -// |shifts| is one pair of weights that applies throughout a given row. 
-template <bool upsampled_top> -inline void DirectionalZone1Blend_4x4( - uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride, - __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts, - const __m128i& dest_index_x, int top_x, const int xstep) { - const int upsample_shift = static_cast<int>(upsampled_top); - const int scale_bits_x = 6 - upsample_shift; - top_x -= xstep; - - int top_base_x = (top_x >> scale_bits_x); - const __m128i vals0 = DirectionalZone2FromSource_SSE4_1( - top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler); - DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds); - top_x -= xstep; - dest += stride; - - top_base_x = (top_x >> scale_bits_x); - const __m128i vals1 = DirectionalZone2FromSource_SSE4_1( - top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler); - DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds); - top_x -= xstep; - dest += stride; - - top_base_x = (top_x >> scale_bits_x); - const __m128i vals2 = DirectionalZone2FromSource_SSE4_1( - top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler); - DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds); - top_x -= xstep; - dest += stride; - - top_base_x = (top_x >> scale_bits_x); - const __m128i vals3 = DirectionalZone2FromSource_SSE4_1( - top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler); - DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds); -} - -template <bool upsampled_top, int height> -inline void DirectionalZone1Blend_8xH( - uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride, - __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts, - const __m128i& dest_index_x, int top_x, const int xstep) { - const int upsample_shift = static_cast<int>(upsampled_top); - const int scale_bits_x = 6 - upsample_shift; - - __m128i y_selector = _mm_set1_epi32(0x01000100); - const __m128i index_increment = _mm_set1_epi32(0x02020202); - for 
(int y = 0; y < height; ++y, - y_selector = _mm_add_epi8(y_selector, index_increment), - dest += stride) { - top_x -= xstep; - const int top_base_x = top_x >> scale_bits_x; - const __m128i vals = DirectionalZone2FromSource_SSE4_1( - top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler); - DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds, y_selector); - } -} - -// 7.11.2.4 (8) 90 < angle > 180 -// The strategy for this function is to know how many blocks can be processed -// with just pixels from |top_ptr|, then handle mixed blocks, then handle only -// blocks that take from |left_ptr|. Additionally, a fast index-shuffle -// approach is used for pred values from |left_column| in sections that permit -// it. -template <bool upsampled_left, bool upsampled_top> -inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride, - const uint8_t* const top_row, - const uint8_t* const left_column, - const int width, const int height, - const int xstep, const int ystep) { - auto* dst = static_cast<uint8_t*>(dest); - const int upsample_left_shift = static_cast<int>(upsampled_left); - const int upsample_top_shift = static_cast<int>(upsampled_top); - const __m128i max_shift = _mm_set1_epi8(32); - const ptrdiff_t stride8 = stride << 3; - const __m128i dest_index_x = - _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000); - const __m128i sampler_top = - upsampled_top - ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) - : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); - const __m128i shift_mask = _mm_set1_epi32(0x003F003F); - // All columns from |min_top_only_x| to the right will only need |top_row| to - // compute. This assumes minimum |xstep| is 3. - const int min_top_only_x = std::min((height * xstep) >> 6, width); - - // For steep angles, the source pixels from left_column may not fit in a - // 16-byte load for shuffling. - // TODO(petersonab): Find a more precise formula for this subject to x. 
- const int max_shuffle_height = - std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]); - - const int xstep8 = xstep << 3; - const __m128i xstep8_vect = _mm_set1_epi16(xstep8); - // Accumulate xstep across 8 rows. - const __m128i xstep_dup = _mm_set1_epi16(-xstep); - const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); - const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments); - // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 - const __m128i scaled_one = _mm_set1_epi16(-64); - __m128i xstep_bounds_base = - (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift) - : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift); - - const int left_base_increment = ystep >> 6; - const int ystep_remainder = ystep & 0x3F; - const int ystep8 = ystep << 3; - const int left_base_increment8 = ystep8 >> 6; - const int ystep_remainder8 = ystep8 & 0x3F; - const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8); - - // If the 64 scaling is regarded as a decimal point, the first value of the - // left_y vector omits the portion which is covered under the left_column - // offset. Following values need the full ystep as a relative offset. - const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder); - const __m128i ystep_dup = _mm_set1_epi16(-ystep); - __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x); - left_y = _mm_add_epi16(ystep_init, left_y); - - const __m128i increment_top8 = _mm_set1_epi16(8 << 6); - int x = 0; - - // This loop treats each set of 4 columns in 3 stages with y-value boundaries. - // The first stage, before the first y-loop, covers blocks that are only - // computed from the top row. The second stage, comprising two y-loops, covers - // blocks that have a mixture of values computed from top or left. The final - // stage covers blocks that are only computed from the left. 
- for (int left_offset = -left_base_increment; x < min_top_only_x; - x += 8, - xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8), - // Watch left_y because it can still get big. - left_y = _mm_add_epi16(left_y, increment_left8), - left_offset -= left_base_increment8) { - uint8_t* dst_x = dst + x; - - // Round down to the nearest multiple of 8. - const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; - DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift), - max_top_only_y, -xstep, upsampled_top); - DirectionalZone1_4xH(dst_x + 4, stride, - top_row + ((x + 4) << upsample_top_shift), - max_top_only_y, -xstep, upsampled_top); - - int y = max_top_only_y; - dst_x += stride * y; - const int xstep_y = xstep * y; - const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y); - // All rows from |min_left_only_y| down for this set of columns, only need - // |left_column| to compute. - const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height); - // At high angles such that min_left_only_y < 8, ystep is low and xstep is - // high. This means that max_shuffle_height is unbounded and xstep_bounds - // will overflow in 16 bits. This is prevented by stopping the first - // blending loop at min_left_only_y for such cases, which means we skip over - // the second blending loop as well. 
- const int left_shuffle_stop_y = - std::min(max_shuffle_height, min_left_only_y); - __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect); - __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect); - int top_x = -xstep_y; - - for (; y < left_shuffle_stop_y; - y += 8, dst_x += stride8, - xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect), - xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect), - top_x -= xstep8) { - DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>( - dst_x, stride, - left_column + ((left_offset + y) << upsample_left_shift), left_y); - - __m128i shifts = _mm_srli_epi16( - _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), - shift_mask), - 1); - shifts = _mm_packus_epi16(shifts, shifts); - __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); - shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); - __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); - DirectionalZone1Blend_8xH<upsampled_top, 8>( - dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, - xstep_bounds_off, shifts, dest_index_x, top_x, xstep); - } - // Pick up from the last y-value, using the 10% slower but secure method for - // left prediction. 
- const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0)); - for (; y < min_left_only_y; - y += 8, dst_x += stride8, - xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect), - xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect), - top_x -= xstep8) { - const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); - - DirectionalZone3_8xH<upsampled_left, 8>( - dst_x, stride, - left_column + ((left_offset + y) << upsample_left_shift), base_left_y, - -ystep); - - __m128i shifts = _mm_srli_epi16( - _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), - shift_mask), - 1); - shifts = _mm_packus_epi16(shifts, shifts); - __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); - shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); - DirectionalZone1Blend_8xH<upsampled_top, 8>( - dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, - xstep_bounds_off, shifts, dest_index_x, top_x, xstep); - } - // Loop over y for left_only rows. 
- for (; y < height; y += 8, dst_x += stride8) { - DirectionalZone3_8xH<upsampled_left, 8>( - dst_x, stride, - left_column + ((left_offset + y) << upsample_left_shift), base_left_y, - -ystep); - } - } - for (; x < width; x += 4) { - DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift), - height, -xstep, upsampled_top); - } -} - -template <bool upsampled_left, bool upsampled_top> -inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride, - const uint8_t* const top_row, - const uint8_t* const left_column, - const int width, const int height, - const int xstep, const int ystep) { - auto* dst = static_cast<uint8_t*>(dest); - const int upsample_left_shift = static_cast<int>(upsampled_left); - const int upsample_top_shift = static_cast<int>(upsampled_top); - const __m128i max_shift = _mm_set1_epi8(32); - const ptrdiff_t stride4 = stride << 2; - const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000); - const __m128i sampler_top = - upsampled_top - ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) - : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); - // All columns from |min_top_only_x| to the right will only need |top_row| to - // compute. - assert(xstep >= 3); - const int min_top_only_x = std::min((height * xstep) >> 6, width); - - const int xstep4 = xstep << 2; - const __m128i xstep4_vect = _mm_set1_epi16(xstep4); - const __m128i xstep_dup = _mm_set1_epi16(-xstep); - const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001); - __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments); - const __m128i scaled_one = _mm_set1_epi16(-64); - // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 - __m128i xstep_bounds_base = - (xstep == 64) ? 
_mm_sub_epi16(scaled_one, xstep_for_shift) - : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift); - - const int left_base_increment = ystep >> 6; - const int ystep_remainder = ystep & 0x3F; - const int ystep4 = ystep << 2; - const int left_base_increment4 = ystep4 >> 6; - // This is guaranteed to be less than 64, but accumulation may bring it past - // 64 for higher x values. - const int ystep_remainder4 = ystep4 & 0x3F; - const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4); - const __m128i increment_top4 = _mm_set1_epi16(4 << 6); - - // If the 64 scaling is regarded as a decimal point, the first value of the - // left_y vector omits the portion which will go into the left_column offset. - // Following values need the full ystep as a relative offset. - const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder); - const __m128i ystep_dup = _mm_set1_epi16(-ystep); - __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x); - left_y = _mm_add_epi16(ystep_init, left_y); - const __m128i shift_mask = _mm_set1_epi32(0x003F003F); - - int x = 0; - // Loop over x for columns with a mixture of sources. - for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4, - xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4), - left_y = _mm_add_epi16(left_y, increment_left4), - left_offset -= left_base_increment4) { - uint8_t* dst_x = dst + x; - - // Round down to the nearest multiple of 8. - const int max_top_only_y = std::min((x << 6) / xstep, height) & 0xFFFFFFF4; - DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift), - max_top_only_y, -xstep, upsampled_top); - int y = max_top_only_y; - dst_x += stride * y; - const int xstep_y = xstep * y; - const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y); - // All rows from |min_left_only_y| down for this set of columns, only need - // |left_column| to compute. Rounded up to the nearest multiple of 4. 
- const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height); - - __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect); - __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect); - int top_x = -xstep_y; - - // Loop over y for mixed rows. - for (; y < min_left_only_y; - y += 4, dst_x += stride4, - xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect), - xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect), - top_x -= xstep4) { - DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>( - dst_x, stride, - left_column + ((left_offset + y) * (1 << upsample_left_shift)), - left_y); - - __m128i shifts = _mm_srli_epi16( - _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), - shift_mask), - 1); - shifts = _mm_packus_epi16(shifts, shifts); - const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); - shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); - const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); - DirectionalZone1Blend_4x4<upsampled_top>( - dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, - xstep_bounds_off, shifts, dest_index_x, top_x, xstep); - } - // Loop over y for left-only rows, if any. - for (; y < height; y += 4, dst_x += stride4) { - DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>( - dst_x, stride, - left_column + ((left_offset + y) << upsample_left_shift), left_y); - } - } - // Loop over top-only columns, if any. 
- for (; x < width; x += 4) { - DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift), - height, -xstep, upsampled_top); - } -} - -void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column, - const int width, const int height, - const int xstep, const int ystep, - const bool upsampled_top, - const bool upsampled_left) { - // Increasing the negative buffer for this function allows more rows to be - // processed at a time without branching in an inner loop to check the base. - uint8_t top_buffer[288]; - uint8_t left_buffer[288]; - memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160); - memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160); - const uint8_t* top_ptr = top_buffer + 144; - const uint8_t* left_ptr = left_buffer + 144; - if (width == 4 || height == 4) { - if (upsampled_left) { - if (upsampled_top) { - DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr, - width, height, xstep, ystep); - } else { - DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr, - width, height, xstep, ystep); - } - } else { - if (upsampled_top) { - DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr, - width, height, xstep, ystep); - } else { - DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr, - width, height, xstep, ystep); - } - } - return; - } - if (upsampled_left) { - if (upsampled_top) { - DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr, - width, height, xstep, ystep); - } else { - DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr, - width, height, xstep, ystep); - } - } else { - if (upsampled_top) { - DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr, - width, height, xstep, ystep); - } else { - DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr, - width, height, xstep, ystep); - } 
- } -} - -//------------------------------------------------------------------------------ -// FilterIntraPredictor_SSE4_1 - -// Apply all filter taps to the given 7 packed 16-bit values, keeping the 8th -// at zero to preserve the sum. -inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride, - const __m128i& pixels, const __m128i& taps_0_1, - const __m128i& taps_2_3, const __m128i& taps_4_5, - const __m128i& taps_6_7) { - const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1); - const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3); - // |output_half| contains 8 partial sums. - __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23); - __m128i output = _mm_hadd_epi16(output_half, output_half); - const __m128i output_row0 = - _mm_packus_epi16(RightShiftWithRounding_S16(output, 4), - /* arbitrary pack arg */ output); - Store4(dst, output_row0); - const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5); - const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7); - output_half = _mm_hadd_epi16(mul_1_01, mul_1_23); - output = _mm_hadd_epi16(output_half, output_half); - const __m128i output_row1 = - _mm_packus_epi16(RightShiftWithRounding_S16(output, 4), - /* arbitrary pack arg */ output); - Store4(dst + stride, output_row1); -} - -// 4xH transform sizes are given special treatment because LoadLo8 goes out -// of bounds and every block involves the left column. 
This implementation -// loads TL from the top row for the first block, so it is not -inline void Filter4xH(uint8_t* dest, ptrdiff_t stride, - const uint8_t* const top_ptr, - const uint8_t* const left_ptr, FilterIntraPredictor pred, - const int height) { - const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]); - const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]); - const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]); - const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]); - __m128i top = Load4(top_ptr - 1); - __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4); - __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr)); - left = _mm_slli_si128(left, 5); - - // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1], - // left[2], left[3], left[4], left[5], left[6], left[7] - pixels = _mm_or_si128(left, pixels); - - // Duplicate first 8 bytes. - pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); - Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - dest += stride; // Move to y = 1. - pixels = Load4(dest); - - // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1], - // left[0], left[1], ... - pixels = _mm_or_si128(left, pixels); - - // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last - // byte is an unused value, which shall be multiplied by 0 when we apply the - // filter. - constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006; - - // Insert left[-1] in front as TL and put left[0] and left[1] at the end. - const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask); - pixels = _mm_shuffle_epi8(pixels, pixel_order1); - dest += stride; // Move to y = 2. - Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - dest += stride; // Move to y = 3. - - // Compute the middle 8 rows before using common code for the final 4 rows. 
- // Because the common code below this block assumes that - if (height == 16) { - // This shift allows us to use pixel_order2 twice after shifting by 2 later. - left = _mm_slli_si128(left, 1); - pixels = Load4(dest); - - // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4], - // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3] - pixels = _mm_or_si128(left, pixels); - - // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 7, 8, 15. The - // last byte is an unused value, as above. The top-left was shifted to - // position nine to keep two empty spaces after the top pixels. - constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009; - - // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at - // the end. - const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask); - pixels = _mm_shuffle_epi8(pixels, pixel_order2); - dest += stride; // Move to y = 4. - - // First 4x2 in the if body. - Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - - // Clear all but final pixel in the first 8 of left column. - __m128i keep_top_left = _mm_srli_si128(left, 13); - dest += stride; // Move to y = 5. - pixels = Load4(dest); - left = _mm_srli_si128(left, 2); - - // Relative pixels: top[0], top[1], top[2], top[3], left[-6], - // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1] - pixels = _mm_or_si128(left, pixels); - left = LoadLo8(left_ptr + 8); - - pixels = _mm_shuffle_epi8(pixels, pixel_order2); - dest += stride; // Move to y = 6. - - // Second 4x2 in the if body. - Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - - // Position TL value so we can use pixel_order1. - keep_top_left = _mm_slli_si128(keep_top_left, 6); - dest += stride; // Move to y = 7. 
- pixels = Load4(dest); - left = _mm_slli_si128(left, 7); - left = _mm_or_si128(left, keep_top_left); - - // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, - // left[-1], left[0], left[1], left[2], left[3], ... - pixels = _mm_or_si128(left, pixels); - pixels = _mm_shuffle_epi8(pixels, pixel_order1); - dest += stride; // Move to y = 8. - - // Third 4x2 in the if body. - Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - dest += stride; // Move to y = 9. - - // Prepare final inputs. - pixels = Load4(dest); - left = _mm_srli_si128(left, 2); - - // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2] - // left[-1], left[0], left[1], left[2], left[3], ... - pixels = _mm_or_si128(left, pixels); - pixels = _mm_shuffle_epi8(pixels, pixel_order1); - dest += stride; // Move to y = 10. - - // Fourth 4x2 in the if body. - Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - dest += stride; // Move to y = 11. - } - - // In both the 8 and 16 case, we assume that the left vector has the next TL - // at position 8. - if (height > 4) { - // Erase prior left pixels by shifting TL to position 0. - left = _mm_srli_si128(left, 8); - left = _mm_slli_si128(left, 6); - pixels = Load4(dest); - - // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, - // left[-1], left[0], left[1], left[2], left[3], ... - pixels = _mm_or_si128(left, pixels); - pixels = _mm_shuffle_epi8(pixels, pixel_order1); - dest += stride; // Move to y = 12 or 4. - - // First of final two 4x2 blocks. - Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - dest += stride; // Move to y = 13 or 5. - pixels = Load4(dest); - left = _mm_srli_si128(left, 2); - - // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2] - // left[-1], left[0], left[1], left[2], left[3], ... 
- pixels = _mm_or_si128(left, pixels); - pixels = _mm_shuffle_epi8(pixels, pixel_order1); - dest += stride; // Move to y = 14 or 6. - - // Last of final two 4x2 blocks. - Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - } -} - -void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column, - FilterIntraPredictor pred, const int width, - const int height) { - const auto* const top_ptr = static_cast<const uint8_t*>(top_row); - const auto* const left_ptr = static_cast<const uint8_t*>(left_column); - auto* dst = static_cast<uint8_t*>(dest); - if (width == 4) { - Filter4xH(dst, stride, top_ptr, left_ptr, pred, height); - return; - } - - // There is one set of 7 taps for each of the 4x2 output pixels. - const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]); - const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]); - const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]); - const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]); - - // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at - // the end is an unused value, which shall be multiplied by 0 when we apply - // the filter. - constexpr int64_t kCondenseLeftMask = 0x0F09080403020100; - - // Takes the "left section" and puts it right after p0-p4. - const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask); - - // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last - // byte is unused as above. - constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008; - - // Shuffles the "top left" from the left section, to the front. Used when - // grabbing data from left_column and not top_row. - const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask); - - // This first pass takes care of the cases where the top left pixel comes from - // top_row. 
- __m128i pixels = LoadLo8(top_ptr - 1); - __m128i left = _mm_slli_si128(Load4(left_column), 8); - pixels = _mm_or_si128(pixels, left); - - // Two sets of the same pixels to multiply with two sets of taps. - pixels = _mm_shuffle_epi8(pixels, pixel_order1); - Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7); - left = _mm_srli_si128(left, 1); - - // Load - pixels = Load4(dst + stride); - - // Because of the above shift, this OR 'invades' the final of the first 8 - // bytes of |pixels|. This is acceptable because the 8th filter tap is always - // a padded 0. - pixels = _mm_or_si128(pixels, left); - pixels = _mm_shuffle_epi8(pixels, pixel_order2); - const ptrdiff_t stride2 = stride << 1; - const ptrdiff_t stride4 = stride << 2; - Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - dst += 4; - for (int x = 3; x < width - 4; x += 4) { - pixels = Load4(top_ptr + x); - pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4); - pixels = _mm_insert_epi8(pixels, dst[-1], 5); - pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6); - - // Duplicate bottom half into upper half. - pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); - Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - pixels = Load4(dst + stride - 1); - pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4); - pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5); - pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6); - - // Duplicate bottom half into upper half. - pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); - Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, - taps_4_5, taps_6_7); - dst += 4; - } - - // Now we handle heights that reference previous blocks rather than top_row. - for (int y = 4; y < height; y += 4) { - // Leftmost 4x4 block for this height. - dst -= width; - dst += stride4; - - // Top Left is not available by offset in these leftmost blocks. 
- pixels = Load4(dst - stride); - left = _mm_slli_si128(Load4(left_ptr + y - 1), 8); - left = _mm_insert_epi8(left, left_ptr[y + 3], 12); - pixels = _mm_or_si128(pixels, left); - pixels = _mm_shuffle_epi8(pixels, pixel_order2); - Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - - // The bytes shifted into positions 6 and 7 will be ignored by the shuffle. - left = _mm_srli_si128(left, 2); - pixels = Load4(dst + stride); - pixels = _mm_or_si128(pixels, left); - pixels = _mm_shuffle_epi8(pixels, pixel_order2); - Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, - taps_4_5, taps_6_7); - - dst += 4; - - // Remaining 4x4 blocks for this height. - for (int x = 4; x < width; x += 4) { - pixels = Load4(dst - stride - 1); - pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4); - pixels = _mm_insert_epi8(pixels, dst[-1], 5); - pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6); - - // Duplicate bottom half into upper half. - pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); - Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - pixels = Load4(dst + stride - 1); - pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4); - pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5); - pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6); - - // Duplicate bottom half into upper half. - pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); - Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, - taps_4_5, taps_6_7); - dst += 4; - } - } -} - void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); @@ -2746,21 +1412,6 @@ void Init8bpp() { // These guards check if this version of the function was not superseded by // a higher optimization level, such as AVX. The corresponding #define also // prevents the C version from being added to the table. 
-#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor) - dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1; -#endif -#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1) - dsp->directional_intra_predictor_zone1 = - DirectionalIntraPredictorZone1_SSE4_1; -#endif -#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2) - dsp->directional_intra_predictor_zone2 = - DirectionalIntraPredictorZone2_SSE4_1; -#endif -#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3) - dsp->directional_intra_predictor_zone3 = - DirectionalIntraPredictorZone3_SSE4_1; -#endif #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop) dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] = DcDefs::_4x4::DcTop; @@ -3524,7 +2175,7 @@ void IntraPredInit_SSE4_1() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/intrapred_sse4.h b/src/dsp/x86/intrapred_sse4.h index 7f4fcd7..1f6f30a 100644 --- a/src/dsp/x86/intrapred_sse4.h +++ b/src/dsp/x86/intrapred_sse4.h @@ -23,13 +23,9 @@ namespace libgav1 { namespace dsp { -// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*, -// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and -// Dsp::filter_intra_predictor, see the defines below for specifics. These -// functions are not thread-safe. +// Initializes Dsp::intra_predictors. See the defines below for specifics. +// These functions are not thread-safe. void IntraPredInit_SSE4_1(); -void IntraPredCflInit_SSE4_1(); -void IntraPredSmoothInit_SSE4_1(); } // namespace dsp } // namespace libgav1 @@ -37,22 +33,6 @@ void IntraPredSmoothInit_SSE4_1(); // If sse4 is enabled and the baseline isn't set due to a higher level of // optimization being enabled, signal the sse4 implementation should be used. 
#if LIBGAV1_TARGETING_SSE4_1 -#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor -#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 -#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 -#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 -#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_SSE4_1 -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 #endif @@ -138,174 +118,6 @@ void IntraPredSmoothInit_SSE4_1(); LIBGAV1_CPU_SSE4_1 #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef 
LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 -#define 
LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor -#define 
LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1 #endif @@ -658,287 +470,6 @@ void IntraPredSmoothInit_SSE4_1(); LIBGAV1_CPU_SSE4_1 #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 
-#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef 
LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical -#define 
LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal -#define 
LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal -#define 
LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - //------------------------------------------------------------------------------ // 10bpp diff --git a/src/dsp/x86/inverse_transform_sse4.cc b/src/dsp/x86/inverse_transform_sse4.cc index 787d706..12c008f 100644 --- a/src/dsp/x86/inverse_transform_sse4.cc +++ b/src/dsp/x86/inverse_transform_sse4.cc @@ -94,8 +94,7 @@ LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(__m128i* a, __m128i* b, static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16)); const __m128i ba = _mm_unpacklo_epi16(*a, *b); const __m128i ab = _mm_unpacklo_epi16(*b, *a); - const __m128i sign = - _mm_set_epi32(0x80000001, 0x80000001, 
0x80000001, 0x80000001); + const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001)); // -sin cos, -sin cos, -sin cos, -sin cos const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign); const __m128i x0 = _mm_madd_epi16(ba, msin_pcos); @@ -121,8 +120,7 @@ LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(__m128i* a, __m128i* b, const int16_t sin128 = Sin128(angle); const __m128i psin_pcos = _mm_set1_epi32( static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16)); - const __m128i sign = - _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001); + const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001)); // -sin cos, -sin cos, -sin cos, -sin cos const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign); const __m128i ba = _mm_unpacklo_epi16(*a, *b); @@ -229,7 +227,8 @@ LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height, const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0); const __m128i v_src = (width == 4) ? v_src_lo : _mm_shuffle_epi32(v_src_lo, 0); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round = @@ -1039,7 +1038,8 @@ LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height, auto* dst = static_cast<int16_t*>(dest); const __m128i v_src = _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0), 0); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? 
static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round = @@ -1194,7 +1194,8 @@ LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height, __m128i s[8]; const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round = @@ -1519,7 +1520,8 @@ LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height, __m128i x[16]; const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round = @@ -1615,7 +1617,8 @@ LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height, auto* dst = static_cast<int16_t*>(dest); const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round = @@ -1767,7 +1770,8 @@ LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height, auto* dst = static_cast<int16_t*>(dest); const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? 
static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round = @@ -1859,7 +1863,8 @@ LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height, auto* dst = static_cast<int16_t*>(dest); const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round0 = @@ -2918,75 +2923,11 @@ void Wht4TransformLoopColumn_SSE4_1(TransformType tx_type, //------------------------------------------------------------------------------ -template <typename Residual, typename Pixel> -void InitAll(Dsp* const dsp) { - // Maximum transform size for Dct is 64. - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] = - Dct4TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] = - Dct4TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] = - Dct8TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] = - Dct8TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] = - Dct16TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] = - Dct16TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] = - Dct32TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] = - Dct32TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] = - Dct64TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] = - Dct64TransformLoopColumn_SSE4_1; - - // Maximum transform size for Adst is 
16. - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] = - Adst4TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] = - Adst4TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] = - Adst8TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] = - Adst8TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] = - Adst16TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] = - Adst16TransformLoopColumn_SSE4_1; - - // Maximum transform size for Identity transform is 32. - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] = - Identity4TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] = - Identity4TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] = - Identity8TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] = - Identity8TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] = - Identity16TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] = - Identity16TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] = - Identity32TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] = - Identity32TransformLoopColumn_SSE4_1; - - // Maximum transform size for Wht is 4. 
- dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] = - Wht4TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] = - Wht4TransformLoopColumn_SSE4_1; -} - void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); -#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS - InitAll<int16_t, uint8_t>(dsp); -#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + + // Maximum transform size for Dct is 64. #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformDct) dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] = Dct4TransformLoopRow_SSE4_1; @@ -3017,6 +2958,8 @@ void Init8bpp() { dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] = Dct64TransformLoopColumn_SSE4_1; #endif + + // Maximum transform size for Adst is 16. #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformAdst) dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] = Adst4TransformLoopRow_SSE4_1; @@ -3035,6 +2978,8 @@ void Init8bpp() { dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] = Adst16TransformLoopColumn_SSE4_1; #endif + + // Maximum transform size for Identity transform is 32. #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformIdentity) dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] = Identity4TransformLoopRow_SSE4_1; @@ -3059,13 +3004,14 @@ void Init8bpp() { dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] = Identity32TransformLoopColumn_SSE4_1; #endif + + // Maximum transform size for Wht is 4. 
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformWht) dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] = Wht4TransformLoopRow_SSE4_1; dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] = Wht4TransformLoopColumn_SSE4_1; #endif -#endif } } // namespace @@ -3075,7 +3021,7 @@ void InverseTransformInit_SSE4_1() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/loop_filter_sse4.cc b/src/dsp/x86/loop_filter_sse4.cc index d67b450..b9da2d5 100644 --- a/src/dsp/x86/loop_filter_sse4.cc +++ b/src/dsp/x86/loop_filter_sse4.cc @@ -350,7 +350,7 @@ void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh, const __m128i v_mask = _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { __m128i oqp1_f6; __m128i oqp0_f6; @@ -454,7 +454,7 @@ void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh, const __m128i v_mask = _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { __m128i oqp1_f6; __m128i oqp0_f6; @@ -595,7 +595,7 @@ void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh, const __m128i v_mask = _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { __m128i oqp2_f8; __m128i oqp1_f8; __m128i oqp0_f8; @@ -697,7 +697,7 @@ void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh, const __m128i v_mask = _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 
0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { __m128i oqp2_f8; __m128i oqp1_f8; __m128i oqp0_f8; @@ -838,7 +838,7 @@ void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh, const __m128i v_mask = _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { const __m128i p6 = Load4(dst - 7 * stride); const __m128i p5 = Load4(dst - 6 * stride); const __m128i p4 = Load4(dst - 5 * stride); @@ -864,8 +864,7 @@ void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh, oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); - if (_mm_test_all_zeros(v_flat4_mask, - _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) { + if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) { __m128i oqp5_f14; __m128i oqp4_f14; __m128i oqp3_f14; @@ -1050,7 +1049,7 @@ void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh, const __m128i v_mask = _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { const __m128i v_isflatouter4_mask = IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh); const __m128i v_flat4_mask = @@ -1066,8 +1065,7 @@ void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh, oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); - if (_mm_test_all_zeros(v_flat4_mask, - _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) { + if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) { __m128i oqp5_f14; __m128i oqp4_f14; __m128i oqp3_f14; @@ -1458,7 +1456,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal6(void* dest, const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask); const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, 
v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { __m128i oqp1_f6; __m128i oqp0_f6; @@ -1572,7 +1570,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical6(void* dest, ptrdiff_t stride8, const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask); const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { __m128i oqp1_f6; __m128i oqp0_f6; @@ -1711,7 +1709,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal8(void* dest, const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask); const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { __m128i oqp2_f8; __m128i oqp1_f8; __m128i oqp0_f8; @@ -1821,7 +1819,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical8(void* dest, ptrdiff_t stride8, const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask); const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { __m128i oqp2_f8; __m128i oqp1_f8; __m128i oqp0_f8; @@ -1957,7 +1955,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal14(void* dest, const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask); const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { const __m128i p6 = LoadLo8(dst - 7 * stride); const __m128i p5 = LoadLo8(dst - 6 * stride); const __m128i p4 = LoadLo8(dst - 5 * stride); @@ -1984,8 +1982,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal14(void* dest, oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); - if 
(_mm_test_all_zeros(v_flat4_mask, - _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) { + if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) { __m128i oqp5_f14; __m128i oqp4_f14; __m128i oqp3_f14; @@ -2133,7 +2130,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical14(void* dest, ptrdiff_t stride8, const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask); const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { const __m128i v_isflatouter4_mask = IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh); const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask); @@ -2150,8 +2147,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical14(void* dest, ptrdiff_t stride8, oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); - if (_mm_test_all_zeros(v_flat4_mask, - _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) { + if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) { __m128i oqp5_f14; __m128i oqp4_f14; __m128i oqp3_f14; @@ -2245,7 +2241,7 @@ void LoopFilterInit_SSE4_1() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/loop_restoration_10bit_avx2.cc b/src/dsp/x86/loop_restoration_10bit_avx2.cc index 702bdea..b38f322 100644 --- a/src/dsp/x86/loop_restoration_10bit_avx2.cc +++ b/src/dsp/x86/loop_restoration_10bit_avx2.cc @@ -28,7 +28,6 @@ #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/dsp/x86/common_avx2.h" -#include "src/dsp/x86/common_sse4.h" #include "src/utils/common.h" #include "src/utils/constants.h" @@ -472,12 +471,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer, } } -void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info, - const void* const source, const void* const top_border, - const void* 
const bottom_border, const ptrdiff_t stride, - const int width, const int height, - RestorationBuffer* const restoration_buffer, - void* const dest) { +void WienerFilter_AVX2( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { const int16_t* const number_leading_zero_coefficients = restoration_info.wiener_info.number_leading_zero_coefficients; const int number_rows_to_skip = std::max( @@ -502,39 +501,42 @@ void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info, LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]); const __m256i coefficients_horizontal = _mm256_broadcastq_epi64(c); if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { - WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride, - wiener_stride, height_extra, &coefficients_horizontal, - &wiener_buffer_horizontal); - WienerHorizontalTap7(src - 3, stride, wiener_stride, height, + WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3, + top_border_stride, wiener_stride, height_extra, &coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra, + WienerHorizontalTap7(src - 3, stride, wiener_stride, height, &coefficients_horizontal, &wiener_buffer_horizontal); - } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { - WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride, - wiener_stride, height_extra, &coefficients_horizontal, + WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride, + height_extra, &coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(src - 2, stride, wiener_stride, height, + } else if 
(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2, + top_border_stride, wiener_stride, height_extra, &coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra, + WienerHorizontalTap5(src - 2, stride, wiener_stride, height, &coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride, + height_extra, &coefficients_horizontal, + &wiener_buffer_horizontal); } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { // The maximum over-reads happen here. - WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride, - wiener_stride, height_extra, &coefficients_horizontal, - &wiener_buffer_horizontal); - WienerHorizontalTap3(src - 1, stride, wiener_stride, height, + WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1, + top_border_stride, wiener_stride, height_extra, &coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra, + WienerHorizontalTap3(src - 1, stride, wiener_stride, height, &coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride, + height_extra, &coefficients_horizontal, + &wiener_buffer_horizontal); } else { assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); - WienerHorizontalTap1(top + (2 - height_extra) * stride, stride, - wiener_stride, height_extra, + WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride, + top_border_stride, wiener_stride, height_extra, &wiener_buffer_horizontal); WienerHorizontalTap1(src, stride, wiener_stride, height, &wiener_buffer_horizontal); - WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra, - &wiener_buffer_horizontal); + WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride, 
+ height_extra, &wiener_buffer_horizontal); } // vertical filtering. @@ -566,12 +568,2575 @@ void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info, } } +//------------------------------------------------------------------------------ +// SGR + +constexpr int kSumOffset = 24; + +// SIMD overreads the number of pixels in SIMD registers - (width % 8) - 2 * +// padding pixels, where padding is 3 for Pass 1 and 2 for Pass 2. The number of +// bytes in SIMD registers is 16 for SSE4.1 and 32 for AVX2. +constexpr int kOverreadInBytesPass1_128 = 4; +constexpr int kOverreadInBytesPass2_128 = 8; +constexpr int kOverreadInBytesPass1_256 = kOverreadInBytesPass1_128 + 16; +constexpr int kOverreadInBytesPass2_256 = kOverreadInBytesPass2_128 + 16; + +inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x, + __m128i dst[2]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); +} + +inline void LoadAligned32x2U16(const uint16_t* const src[2], const ptrdiff_t x, + __m256i dst[2]) { + dst[0] = LoadAligned32(src[0] + x); + dst[1] = LoadAligned32(src[1] + x); +} + +inline void LoadAligned32x2U16Msan(const uint16_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[2]) { + dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border)); + dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border)); +} + +inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x, + __m128i dst[3]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); + dst[2] = LoadAligned16(src[2] + x); +} + +inline void LoadAligned32x3U16(const uint16_t* const src[3], const ptrdiff_t x, + __m256i dst[3]) { + dst[0] = LoadAligned32(src[0] + x); + dst[1] = LoadAligned32(src[1] + x); + dst[2] = LoadAligned32(src[2] + x); +} + +inline void LoadAligned32x3U16Msan(const uint16_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[3]) { + dst[0] = 
LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border)); + dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border)); + dst[2] = LoadAligned32Msan(src[2] + x, sizeof(**src) * (x + 16 - border)); +} + +inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) { + dst[0] = LoadAligned16(src + 0); + dst[1] = LoadAligned16(src + 4); +} + +inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x, + __m128i dst[2][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); +} + +inline void LoadAligned64x2U32(const uint32_t* const src[2], const ptrdiff_t x, + __m256i dst[2][2]) { + LoadAligned64(src[0] + x, dst[0]); + LoadAligned64(src[1] + x, dst[1]); +} + +inline void LoadAligned64x2U32Msan(const uint32_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[2][2]) { + LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]); + LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]); +} + +inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x, + __m128i dst[3][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); + LoadAligned32U32(src[2] + x, dst[2]); +} + +inline void LoadAligned64x3U32(const uint32_t* const src[3], const ptrdiff_t x, + __m256i dst[3][2]) { + LoadAligned64(src[0] + x, dst[0]); + LoadAligned64(src[1] + x, dst[1]); + LoadAligned64(src[2] + x, dst[2]); +} + +inline void LoadAligned64x3U32Msan(const uint32_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[3][2]) { + LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]); + LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]); + LoadAligned64Msan(src[2] + x, sizeof(**src) * (x + 16 - border), dst[2]); +} + +inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) { + StoreAligned16(dst + 0, src[0]); + StoreAligned16(dst + 4, src[1]); 
+} + +// The AVX2 ymm register holds ma[0], ma[1], ..., ma[7], and ma[16], ma[17], +// ..., ma[23]. +// There is an 8 pixel gap between the first half and the second half. +constexpr int kMaStoreOffset = 8; + +inline void StoreAligned32_ma(uint16_t* src, const __m256i v) { + StoreAligned16(src + 0 * 8, _mm256_extracti128_si256(v, 0)); + StoreAligned16(src + 2 * 8, _mm256_extracti128_si256(v, 1)); +} + +inline void StoreAligned64_ma(uint16_t* src, const __m256i v[2]) { + // The next 4 lines are much faster than: + // StoreAligned32(src + 0, _mm256_permute2x128_si256(v[0], v[1], 0x20)); + // StoreAligned32(src + 16, _mm256_permute2x128_si256(v[0], v[1], 0x31)); + StoreAligned16(src + 0 * 8, _mm256_extracti128_si256(v[0], 0)); + StoreAligned16(src + 1 * 8, _mm256_extracti128_si256(v[1], 0)); + StoreAligned16(src + 2 * 8, _mm256_extracti128_si256(v[0], 1)); + StoreAligned16(src + 3 * 8, _mm256_extracti128_si256(v[1], 1)); +} + +// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following +// functions. Some compilers may generate super inefficient code and the whole +// decoder could be 15% slower. 
+ +inline __m256i VaddlLo8(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpacklo_epi8(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(s0, s1); +} + +inline __m256i VaddlHi8(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpackhi_epi8(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(s0, s1); +} + +inline __m256i VaddwLo8(const __m256i src0, const __m256i src1) { + const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(src0, s1); +} + +inline __m256i VaddwHi8(const __m256i src0, const __m256i src1) { + const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(src0, s1); +} + +inline __m256i VmullNLo8(const __m256i src0, const int src1) { + const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1)); +} + +inline __m256i VmullNHi8(const __m256i src0, const int src1) { + const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1)); +} + +inline __m128i VmullLo16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128()); + return _mm_madd_epi16(s0, s1); +} + +inline __m256i VmullLo16(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, s1); +} + +inline __m128i VmullHi16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128()); + return 
_mm_madd_epi16(s0, s1); +} + +inline __m256i VmullHi16(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, s1); +} + +inline __m128i VrshrU16(const __m128i src0, const int src1) { + const __m128i sum = _mm_add_epi16(src0, _mm_set1_epi16(1 << (src1 - 1))); + return _mm_srli_epi16(sum, src1); +} + +inline __m256i VrshrU16(const __m256i src0, const int src1) { + const __m256i sum = + _mm256_add_epi16(src0, _mm256_set1_epi16(1 << (src1 - 1))); + return _mm256_srli_epi16(sum, src1); +} + +inline __m256i VrshrS32(const __m256i src0, const int src1) { + const __m256i sum = + _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1))); + return _mm256_srai_epi32(sum, src1); +} + +inline __m128i VrshrU32(const __m128i src0, const int src1) { + const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1))); + return _mm_srli_epi32(sum, src1); +} + +inline __m256i VrshrU32(const __m256i src0, const int src1) { + const __m256i sum = + _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1))); + return _mm256_srli_epi32(sum, src1); +} + +inline void Square(const __m128i src, __m128i dst[2]) { + const __m128i s0 = _mm_unpacklo_epi16(src, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi16(src, _mm_setzero_si128()); + dst[0] = _mm_madd_epi16(s0, s0); + dst[1] = _mm_madd_epi16(s1, s1); +} + +inline void Square(const __m256i src, __m256i dst[2]) { + const __m256i s0 = _mm256_unpacklo_epi16(src, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpackhi_epi16(src, _mm256_setzero_si256()); + dst[0] = _mm256_madd_epi16(s0, s0); + dst[1] = _mm256_madd_epi16(s1, s1); +} + +inline void Prepare3_8(const __m256i src[2], __m256i dst[3]) { + dst[0] = _mm256_alignr_epi8(src[1], src[0], 0); + dst[1] = _mm256_alignr_epi8(src[1], src[0], 1); + dst[2] = _mm256_alignr_epi8(src[1], src[0], 2); +} + +inline 
void Prepare3_16(const __m128i src[2], __m128i dst[3]) { + dst[0] = src[0]; + dst[1] = _mm_alignr_epi8(src[1], src[0], 2); + dst[2] = _mm_alignr_epi8(src[1], src[0], 4); +} + +inline void Prepare3_32(const __m128i src[2], __m128i dst[3]) { + dst[0] = src[0]; + dst[1] = _mm_alignr_epi8(src[1], src[0], 4); + dst[2] = _mm_alignr_epi8(src[1], src[0], 8); +} + +inline void Prepare3_32(const __m256i src[2], __m256i dst[3]) { + dst[0] = src[0]; + dst[1] = _mm256_alignr_epi8(src[1], src[0], 4); + dst[2] = _mm256_alignr_epi8(src[1], src[0], 8); +} + +inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) { + Prepare3_16(src, dst); + dst[3] = _mm_alignr_epi8(src[1], src[0], 6); + dst[4] = _mm_alignr_epi8(src[1], src[0], 8); +} + +inline void Prepare5_32(const __m128i src[2], __m128i dst[5]) { + Prepare3_32(src, dst); + dst[3] = _mm_alignr_epi8(src[1], src[0], 12); + dst[4] = src[1]; +} + +inline void Prepare5_32(const __m256i src[2], __m256i dst[5]) { + Prepare3_32(src, dst); + dst[3] = _mm256_alignr_epi8(src[1], src[0], 12); + dst[4] = src[1]; +} + +inline __m128i Sum3_16(const __m128i src0, const __m128i src1, + const __m128i src2) { + const __m128i sum = _mm_add_epi16(src0, src1); + return _mm_add_epi16(sum, src2); +} + +inline __m256i Sum3_16(const __m256i src0, const __m256i src1, + const __m256i src2) { + const __m256i sum = _mm256_add_epi16(src0, src1); + return _mm256_add_epi16(sum, src2); +} + +inline __m128i Sum3_16(const __m128i src[3]) { + return Sum3_16(src[0], src[1], src[2]); +} + +inline __m256i Sum3_16(const __m256i src[3]) { + return Sum3_16(src[0], src[1], src[2]); +} + +inline __m128i Sum3_32(const __m128i src0, const __m128i src1, + const __m128i src2) { + const __m128i sum = _mm_add_epi32(src0, src1); + return _mm_add_epi32(sum, src2); +} + +inline __m256i Sum3_32(const __m256i src0, const __m256i src1, + const __m256i src2) { + const __m256i sum = _mm256_add_epi32(src0, src1); + return _mm256_add_epi32(sum, src2); +} + +inline __m128i 
Sum3_32(const __m128i src[3]) { + return Sum3_32(src[0], src[1], src[2]); +} + +inline __m256i Sum3_32(const __m256i src[3]) { + return Sum3_32(src[0], src[1], src[2]); +} + +inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) { + dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]); + dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]); +} + +inline void Sum3_32(const __m256i src[3][2], __m256i dst[2]) { + dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]); + dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]); +} + +inline __m256i Sum3WLo16(const __m256i src[3]) { + const __m256i sum = VaddlLo8(src[0], src[1]); + return VaddwLo8(sum, src[2]); +} + +inline __m256i Sum3WHi16(const __m256i src[3]) { + const __m256i sum = VaddlHi8(src[0], src[1]); + return VaddwHi8(sum, src[2]); +} + +inline __m128i Sum5_16(const __m128i src[5]) { + const __m128i sum01 = _mm_add_epi16(src[0], src[1]); + const __m128i sum23 = _mm_add_epi16(src[2], src[3]); + const __m128i sum = _mm_add_epi16(sum01, sum23); + return _mm_add_epi16(sum, src[4]); +} + +inline __m256i Sum5_16(const __m256i src[5]) { + const __m256i sum01 = _mm256_add_epi16(src[0], src[1]); + const __m256i sum23 = _mm256_add_epi16(src[2], src[3]); + const __m256i sum = _mm256_add_epi16(sum01, sum23); + return _mm256_add_epi16(sum, src[4]); +} + +inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1, + const __m128i* const src2, const __m128i* const src3, + const __m128i* const src4) { + const __m128i sum01 = _mm_add_epi32(*src0, *src1); + const __m128i sum23 = _mm_add_epi32(*src2, *src3); + const __m128i sum = _mm_add_epi32(sum01, sum23); + return _mm_add_epi32(sum, *src4); +} + +inline __m256i Sum5_32(const __m256i* const src0, const __m256i* const src1, + const __m256i* const src2, const __m256i* const src3, + const __m256i* const src4) { + const __m256i sum01 = _mm256_add_epi32(*src0, *src1); + const __m256i sum23 = _mm256_add_epi32(*src2, *src3); + const __m256i sum = _mm256_add_epi32(sum01, 
sum23); + return _mm256_add_epi32(sum, *src4); +} + +inline __m128i Sum5_32(const __m128i src[5]) { + return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]); +} + +inline __m256i Sum5_32(const __m256i src[5]) { + return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]); +} + +inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) { + dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]); + dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]); +} + +inline void Sum5_32(const __m256i src[5][2], __m256i dst[2]) { + dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]); + dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]); +} + +inline __m128i Sum3Horizontal16(const __m128i src[2]) { + __m128i s[3]; + Prepare3_16(src, s); + return Sum3_16(s); +} + +inline __m256i Sum3Horizontal16(const uint16_t* const src, + const ptrdiff_t over_read_in_bytes) { + __m256i s[3]; + s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0); + s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2); + s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4); + return Sum3_16(s); +} + +inline __m128i Sum5Horizontal16(const __m128i src[2]) { + __m128i s[5]; + Prepare5_16(src, s); + return Sum5_16(s); +} + +inline __m256i Sum5Horizontal16(const uint16_t* const src, + const ptrdiff_t over_read_in_bytes) { + __m256i s[5]; + s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0); + s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2); + s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4); + s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 6); + s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 8); + return Sum5_16(s); +} + +inline void SumHorizontal16(const uint16_t* const src, + const ptrdiff_t over_read_in_bytes, + __m256i* const row3, __m256i* const row5) { + __m256i s[5]; + s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0); + s[1] 
= LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2); + s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4); + s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 6); + s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 8); + const __m256i sum04 = _mm256_add_epi16(s[0], s[4]); + *row3 = Sum3_16(s + 1); + *row5 = _mm256_add_epi16(sum04, *row3); +} + +inline void SumHorizontal16(const uint16_t* const src, + const ptrdiff_t over_read_in_bytes, + __m256i* const row3_0, __m256i* const row3_1, + __m256i* const row5_0, __m256i* const row5_1) { + SumHorizontal16(src + 0, over_read_in_bytes + 0, row3_0, row5_0); + SumHorizontal16(src + 16, over_read_in_bytes + 32, row3_1, row5_1); +} + +inline void SumHorizontal32(const __m128i src[5], __m128i* const row_sq3, + __m128i* const row_sq5) { + const __m128i sum04 = _mm_add_epi32(src[0], src[4]); + *row_sq3 = Sum3_32(src + 1); + *row_sq5 = _mm_add_epi32(sum04, *row_sq3); +} + +inline void SumHorizontal32(const __m256i src[5], __m256i* const row_sq3, + __m256i* const row_sq5) { + const __m256i sum04 = _mm256_add_epi32(src[0], src[4]); + *row_sq3 = Sum3_32(src + 1); + *row_sq5 = _mm256_add_epi32(sum04, *row_sq3); +} + +inline void SumHorizontal32(const __m128i src[3], __m128i* const row_sq3_0, + __m128i* const row_sq3_1, __m128i* const row_sq5_0, + __m128i* const row_sq5_1) { + __m128i s[5]; + Prepare5_32(src + 0, s); + SumHorizontal32(s, row_sq3_0, row_sq5_0); + Prepare5_32(src + 1, s); + SumHorizontal32(s, row_sq3_1, row_sq5_1); +} + +inline void SumHorizontal32(const __m256i src[3], __m256i* const row_sq3_0, + __m256i* const row_sq3_1, __m256i* const row_sq5_0, + __m256i* const row_sq5_1) { + __m256i s[5]; + Prepare5_32(src + 0, s); + SumHorizontal32(s, row_sq3_0, row_sq5_0); + Prepare5_32(src + 1, s); + SumHorizontal32(s, row_sq3_1, row_sq5_1); +} + +inline void Sum3Horizontal32(const __m128i src[3], __m128i dst[2]) { + __m128i s[3]; + Prepare3_32(src + 0, s); + dst[0] = Sum3_32(s); + Prepare3_32(src + 1, 
s); + dst[1] = Sum3_32(s); +} + +inline void Sum3Horizontal32(const __m256i src[3], __m256i dst[2]) { + __m256i s[3]; + Prepare3_32(src + 0, s); + dst[0] = Sum3_32(s); + Prepare3_32(src + 1, s); + dst[1] = Sum3_32(s); +} + +inline void Sum5Horizontal32(const __m128i src[3], __m128i dst[2]) { + __m128i s[5]; + Prepare5_32(src + 0, s); + dst[0] = Sum5_32(s); + Prepare5_32(src + 1, s); + dst[1] = Sum5_32(s); +} + +inline void Sum5Horizontal32(const __m256i src[3], __m256i dst[2]) { + __m256i s[5]; + Prepare5_32(src + 0, s); + dst[0] = Sum5_32(s); + Prepare5_32(src + 1, s); + dst[1] = Sum5_32(s); +} + +void SumHorizontal16(const __m128i src[2], __m128i* const row3, + __m128i* const row5) { + __m128i s[5]; + Prepare5_16(src, s); + const __m128i sum04 = _mm_add_epi16(s[0], s[4]); + *row3 = Sum3_16(s + 1); + *row5 = _mm_add_epi16(sum04, *row3); +} + +inline __m256i Sum343Lo(const __m256i ma3[3]) { + const __m256i sum = Sum3WLo16(ma3); + const __m256i sum3 = Sum3_16(sum, sum, sum); + return VaddwLo8(sum3, ma3[1]); +} + +inline __m256i Sum343Hi(const __m256i ma3[3]) { + const __m256i sum = Sum3WHi16(ma3); + const __m256i sum3 = Sum3_16(sum, sum, sum); + return VaddwHi8(sum3, ma3[1]); +} + +inline __m256i Sum343(const __m256i src[3]) { + const __m256i sum = Sum3_32(src); + const __m256i sum3 = Sum3_32(sum, sum, sum); + return _mm256_add_epi32(sum3, src[1]); +} + +inline void Sum343(const __m256i src[3], __m256i dst[2]) { + __m256i s[3]; + Prepare3_32(src + 0, s); + dst[0] = Sum343(s); + Prepare3_32(src + 1, s); + dst[1] = Sum343(s); +} + +inline __m256i Sum565Lo(const __m256i src[3]) { + const __m256i sum = Sum3WLo16(src); + const __m256i sum4 = _mm256_slli_epi16(sum, 2); + const __m256i sum5 = _mm256_add_epi16(sum4, sum); + return VaddwLo8(sum5, src[1]); +} + +inline __m256i Sum565Hi(const __m256i src[3]) { + const __m256i sum = Sum3WHi16(src); + const __m256i sum4 = _mm256_slli_epi16(sum, 2); + const __m256i sum5 = _mm256_add_epi16(sum4, sum); + return VaddwHi8(sum5, 
src[1]); +} + +inline __m256i Sum565(const __m256i src[3]) { + const __m256i sum = Sum3_32(src); + const __m256i sum4 = _mm256_slli_epi32(sum, 2); + const __m256i sum5 = _mm256_add_epi32(sum4, sum); + return _mm256_add_epi32(sum5, src[1]); +} + +inline void Sum565(const __m256i src[3], __m256i dst[2]) { + __m256i s[3]; + Prepare3_32(src + 0, s); + dst[0] = Sum565(s); + Prepare3_32(src + 1, s); + dst[1] = Sum565(s); +} + +inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const ptrdiff_t sum_stride, + const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5, + uint32_t* square_sum3, uint32_t* square_sum5) { + const ptrdiff_t overread_in_bytes_128 = + kOverreadInBytesPass1_128 - sizeof(*src) * width; + const ptrdiff_t overread_in_bytes_256 = + kOverreadInBytesPass1_256 - sizeof(*src) * width; + int y = 2; + do { + __m128i s0[2], sq_128[4], s3, s5, sq3[2], sq5[2]; + __m256i sq[8]; + s0[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128 + 0); + s0[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16); + Square(s0[0], sq_128 + 0); + Square(s0[1], sq_128 + 2); + SumHorizontal16(s0, &s3, &s5); + StoreAligned16(sum3, s3); + StoreAligned16(sum5, s5); + SumHorizontal32(sq_128, &sq3[0], &sq3[1], &sq5[0], &sq5[1]); + StoreAligned32U32(square_sum3, sq3); + StoreAligned32U32(square_sum5, sq5); + src += 8; + sum3 += 8; + sum5 += 8; + square_sum3 += 8; + square_sum5 += 8; + sq[0] = SetrM128i(sq_128[2], sq_128[2]); + sq[1] = SetrM128i(sq_128[3], sq_128[3]); + ptrdiff_t x = sum_width; + do { + __m256i s[2], row3[2], row5[2], row_sq3[2], row_sq5[2]; + s[0] = LoadUnaligned32Msan( + src + 8, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8)); + s[1] = LoadUnaligned32Msan( + src + 24, + overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 24)); + Square(s[0], sq + 2); + Square(s[1], sq + 6); + sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); + sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21); + sq[4] = 
_mm256_permute2x128_si256(sq[2], sq[6], 0x21); + sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21); + SumHorizontal16( + src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8), + &row3[0], &row3[1], &row5[0], &row5[1]); + StoreAligned64(sum3, row3); + StoreAligned64(sum5, row5); + SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], + &row_sq5[1]); + StoreAligned64(square_sum3 + 0, row_sq3); + StoreAligned64(square_sum5 + 0, row_sq5); + SumHorizontal32(sq + 4, &row_sq3[0], &row_sq3[1], &row_sq5[0], + &row_sq5[1]); + StoreAligned64(square_sum3 + 16, row_sq3); + StoreAligned64(square_sum5 + 16, row_sq5); + sq[0] = sq[6]; + sq[1] = sq[7]; + src += 32; + sum3 += 32; + sum5 += 32; + square_sum3 += 32; + square_sum5 += 32; + x -= 32; + } while (x != 0); + src += src_stride - sum_width - 8; + sum3 += sum_stride - sum_width - 8; + sum5 += sum_stride - sum_width - 8; + square_sum3 += sum_stride - sum_width - 8; + square_sum5 += sum_stride - sum_width - 8; + } while (--y != 0); +} + +template <int size> +inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const ptrdiff_t sum_stride, + const ptrdiff_t sum_width, uint16_t* sums, + uint32_t* square_sums) { + static_assert(size == 3 || size == 5, ""); + int overread_in_bytes_128, overread_in_bytes_256; + if (size == 3) { + overread_in_bytes_128 = kOverreadInBytesPass2_128; + overread_in_bytes_256 = kOverreadInBytesPass2_256; + } else { + overread_in_bytes_128 = kOverreadInBytesPass1_128; + overread_in_bytes_256 = kOverreadInBytesPass1_256; + } + overread_in_bytes_128 -= sizeof(*src) * width; + overread_in_bytes_256 -= sizeof(*src) * width; + int y = 2; + do { + __m128i s_128[2], ss, sq_128[4], sqs[2]; + __m256i sq[8]; + s_128[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128); + s_128[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16); + Square(s_128[0], sq_128 + 0); + Square(s_128[1], sq_128 + 2); + if (size == 3) { + ss = Sum3Horizontal16(s_128); 
+ Sum3Horizontal32(sq_128, sqs); + } else { + ss = Sum5Horizontal16(s_128); + Sum5Horizontal32(sq_128, sqs); + } + StoreAligned16(sums, ss); + StoreAligned32U32(square_sums, sqs); + src += 8; + sums += 8; + square_sums += 8; + sq[0] = SetrM128i(sq_128[2], sq_128[2]); + sq[1] = SetrM128i(sq_128[3], sq_128[3]); + ptrdiff_t x = sum_width; + do { + __m256i s[2], row[2], row_sq[4]; + s[0] = LoadUnaligned32Msan( + src + 8, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8)); + s[1] = LoadUnaligned32Msan( + src + 24, + overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 24)); + Square(s[0], sq + 2); + Square(s[1], sq + 6); + sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); + sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21); + sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21); + sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21); + if (size == 3) { + row[0] = Sum3Horizontal16( + src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8)); + row[1] = + Sum3Horizontal16(src + 16, overread_in_bytes_256 + + sizeof(*src) * (sum_width - x + 24)); + Sum3Horizontal32(sq + 0, row_sq + 0); + Sum3Horizontal32(sq + 4, row_sq + 2); + } else { + row[0] = Sum5Horizontal16( + src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8)); + row[1] = + Sum5Horizontal16(src + 16, overread_in_bytes_256 + + sizeof(*src) * (sum_width - x + 24)); + Sum5Horizontal32(sq + 0, row_sq + 0); + Sum5Horizontal32(sq + 4, row_sq + 2); + } + StoreAligned64(sums, row); + StoreAligned64(square_sums + 0, row_sq + 0); + StoreAligned64(square_sums + 16, row_sq + 2); + sq[0] = sq[6]; + sq[1] = sq[7]; + src += 32; + sums += 32; + square_sums += 32; + x -= 32; + } while (x != 0); + src += src_stride - sum_width - 8; + sums += sum_stride - sum_width - 8; + square_sums += sum_stride - sum_width - 8; + } while (--y != 0); +} + +template <int n> +inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq, + const uint32_t scale) { + static_assert(n == 9 || n == 25, ""); + 
// a = |sum_sq| + // d = |sum| + // p = (a * n < d * d) ? 0 : a * n - d * d; + const __m128i dxd = _mm_madd_epi16(sum, sum); + // _mm_mullo_epi32() has high latency. Using shifts and additions instead. + // Some compilers could do this for us but we make this explicit. + // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n)); + __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3)); + if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4)); + const __m128i sub = _mm_sub_epi32(axn, dxd); + const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128()); + const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale)); + return VrshrU32(pxs, kSgrProjScaleBits); +} + +template <int n> +inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2], + const uint32_t scale) { + static_assert(n == 9 || n == 25, ""); + const __m128i b = VrshrU16(sum, 2); + const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128()); + const __m128i sum_hi = _mm_unpackhi_epi16(b, _mm_setzero_si128()); + const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale); + const __m128i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale); + return _mm_packus_epi32(z0, z1); +} + +template <int n> +inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq, + const uint32_t scale) { + static_assert(n == 9 || n == 25, ""); + // a = |sum_sq| + // d = |sum| + // p = (a * n < d * d) ? 0 : a * n - d * d; + const __m256i dxd = _mm256_madd_epi16(sum, sum); + // _mm256_mullo_epi32() has high latency. Using shifts and additions instead. + // Some compilers could do this for us but we make this explicit. 
+ // return _mm256_mullo_epi32(sum_sq, _mm256_set1_epi32(n)); + __m256i axn = _mm256_add_epi32(sum_sq, _mm256_slli_epi32(sum_sq, 3)); + if (n == 25) axn = _mm256_add_epi32(axn, _mm256_slli_epi32(sum_sq, 4)); + const __m256i sub = _mm256_sub_epi32(axn, dxd); + const __m256i p = _mm256_max_epi32(sub, _mm256_setzero_si256()); + const __m256i pxs = _mm256_mullo_epi32(p, _mm256_set1_epi32(scale)); + return VrshrU32(pxs, kSgrProjScaleBits); +} + +template <int n> +inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2], + const uint32_t scale) { + static_assert(n == 9 || n == 25, ""); + const __m256i b = VrshrU16(sum, 2); + const __m256i sum_lo = _mm256_unpacklo_epi16(b, _mm256_setzero_si256()); + const __m256i sum_hi = _mm256_unpackhi_epi16(b, _mm256_setzero_si256()); + const __m256i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale); + const __m256i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale); + return _mm256_packus_epi32(z0, z1); +} + +inline void CalculateB5(const __m128i sum, const __m128i ma, __m128i b[2]) { + // one_over_n == 164. + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25; + // one_over_n_quarter == 41. + constexpr uint32_t one_over_n_quarter = one_over_n >> 2; + static_assert(one_over_n == one_over_n_quarter << 2, ""); + // |ma| is in range [0, 255]. + const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter)); + const __m128i m0 = VmullLo16(m, sum); + const __m128i m1 = VmullHi16(m, sum); + b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2); + b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2); +} + +inline void CalculateB5(const __m256i sum, const __m256i ma, __m256i b[2]) { + // one_over_n == 164. + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25; + // one_over_n_quarter == 41. + constexpr uint32_t one_over_n_quarter = one_over_n >> 2; + static_assert(one_over_n == one_over_n_quarter << 2, ""); + // |ma| is in range [0, 255]. 
+ const __m256i m = + _mm256_maddubs_epi16(ma, _mm256_set1_epi16(one_over_n_quarter)); + const __m256i m0 = VmullLo16(m, sum); + const __m256i m1 = VmullHi16(m, sum); + b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2); + b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2); +} + +inline void CalculateB3(const __m128i sum, const __m128i ma, __m128i b[2]) { + // one_over_n == 455. + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9; + const __m128i m0 = VmullLo16(ma, sum); + const __m128i m1 = VmullHi16(ma, sum); + const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n)); + const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n)); + b[0] = VrshrU32(m2, kSgrProjReciprocalBits); + b[1] = VrshrU32(m3, kSgrProjReciprocalBits); +} + +inline void CalculateB3(const __m256i sum, const __m256i ma, __m256i b[2]) { + // one_over_n == 455. + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9; + const __m256i m0 = VmullLo16(ma, sum); + const __m256i m1 = VmullHi16(ma, sum); + const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n)); + const __m256i m3 = _mm256_mullo_epi32(m1, _mm256_set1_epi32(one_over_n)); + b[0] = VrshrU32(m2, kSgrProjReciprocalBits); + b[1] = VrshrU32(m3, kSgrProjReciprocalBits); +} + +inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2], + const uint32_t scale, __m128i* const sum, + __m128i* const index) { + __m128i sum_sq[2]; + *sum = Sum5_16(s5); + Sum5_32(sq5, sum_sq); + *index = CalculateMa<25>(*sum, sum_sq, scale); +} + +inline void CalculateSumAndIndex5(const __m256i s5[5], const __m256i sq5[5][2], + const uint32_t scale, __m256i* const sum, + __m256i* const index) { + __m256i sum_sq[2]; + *sum = Sum5_16(s5); + Sum5_32(sq5, sum_sq); + *index = CalculateMa<25>(*sum, sum_sq, scale); +} + +inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2], + const uint32_t scale, __m128i* const sum, + __m128i* const index) { + 
__m128i sum_sq[2]; + *sum = Sum3_16(s3); + Sum3_32(sq3, sum_sq); + *index = CalculateMa<9>(*sum, sum_sq, scale); +} + +inline void CalculateSumAndIndex3(const __m256i s3[3], const __m256i sq3[3][2], + const uint32_t scale, __m256i* const sum, + __m256i* const index) { + __m256i sum_sq[2]; + *sum = Sum3_16(s3); + Sum3_32(sq3, sum_sq); + *index = CalculateMa<9>(*sum, sum_sq, scale); +} + +template <int n> +inline void LookupIntermediate(const __m128i sum, const __m128i index, + __m128i* const ma, __m128i b[2]) { + static_assert(n == 9 || n == 25, ""); + const __m128i idx = _mm_packus_epi16(index, index); + // Actually it's not stored and loaded. The compiler will use a 64-bit + // general-purpose register to process. Faster than using _mm_extract_epi8(). + uint8_t temp[8]; + StoreLo8(temp, idx); + *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], 1); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], 2); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], 3); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], 4); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], 5); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], 6); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], 7); + // b = ma * b * one_over_n + // |ma| = [0, 255] + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. + // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). 
+ const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128()); + if (n == 9) { + CalculateB3(sum, maq, b); + } else { + CalculateB5(sum, maq, b); + } +} + +// Repeat the first 48 elements in kSgrMaLookup with a period of 16. +alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = { + 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, + 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, + 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, + 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, + 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, + 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5}; + +// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b +// to get value 0 as the shuffle result. The most significiant bit 1 comes +// either from the comparison instruction, or from the sign bit of the index. +inline __m128i ShuffleIndex(const __m128i table, const __m128i index) { + __m128i mask; + mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15)); + mask = _mm_or_si128(mask, index); + return _mm_shuffle_epi8(table, mask); +} + +inline __m256i ShuffleIndex(const __m256i table, const __m256i index) { + __m256i mask; + mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15)); + mask = _mm256_or_si256(mask, index); + return _mm256_shuffle_epi8(table, mask); +} + +inline __m128i AdjustValue(const __m128i value, const __m128i index, + const int threshold) { + const __m128i thresholds = _mm_set1_epi8(threshold - 128); + const __m128i offset = _mm_cmpgt_epi8(index, thresholds); + return _mm_add_epi8(value, offset); +} + +inline __m256i AdjustValue(const __m256i value, const __m256i index, + const int threshold) { + const __m256i thresholds = _mm256_set1_epi8(threshold - 128); + const __m256i offset = _mm256_cmpgt_epi8(index, thresholds); + return _mm256_add_epi8(value, offset); +} + +inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], + __m128i* const ma, __m128i b0[2], + __m128i b1[2]) { + 
// Use table lookup to read elements whose indices are less than 48. + const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16); + const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16); + const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16); + const __m128i indices = _mm_packus_epi16(index[0], index[1]); + __m128i idx; + // Clip idx to 127 to apply signed comparison instructions. + idx = _mm_min_epu8(indices, _mm_set1_epi8(127)); + // All elements whose indices are less than 48 are set to 0. + // Get shuffle results for indices in range [0, 15]. + *ma = ShuffleIndex(c0, idx); + // Get shuffle results for indices in range [16, 31]. + // Subtract 16 to utilize the sign bit of the index. + idx = _mm_sub_epi8(idx, _mm_set1_epi8(16)); + const __m128i res1 = ShuffleIndex(c1, idx); + // Use OR instruction to combine shuffle results together. + *ma = _mm_or_si128(*ma, res1); + // Get shuffle results for indices in range [32, 47]. + // Subtract 16 to utilize the sign bit of the index. + idx = _mm_sub_epi8(idx, _mm_set1_epi8(16)); + const __m128i res2 = ShuffleIndex(c2, idx); + *ma = _mm_or_si128(*ma, res2); + + // For elements whose indices are larger than 47, since they seldom change + // values with the increase of the index, we use comparison and arithmetic + // operations to calculate their values. + // Add -128 to apply signed comparison instructions. + idx = _mm_add_epi8(indices, _mm_set1_epi8(-128)); + // Elements whose indices are larger than 47 (with value 0) are set to 5. + *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5)); + *ma = AdjustValue(*ma, idx, 55); // 55 is the last index which value is 5. + *ma = AdjustValue(*ma, idx, 72); // 72 is the last index which value is 4. + *ma = AdjustValue(*ma, idx, 101); // 101 is the last index which value is 3. + *ma = AdjustValue(*ma, idx, 169); // 169 is the last index which value is 2. + *ma = AdjustValue(*ma, idx, 254); // 254 is the last index which value is 1. 
+ + // b = ma * b * one_over_n + // |ma| = [0, 255] + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. + // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). + const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128()); + CalculateB3(sum[0], maq0, b0); + const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128()); + CalculateB3(sum[1], maq1, b1); +} + +template <int n> +inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2], + __m256i ma[3], __m256i b0[2], __m256i b1[2]) { + static_assert(n == 9 || n == 25, ""); + // Use table lookup to read elements whose indices are less than 48. + const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32); + const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32); + const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32); + const __m256i indices = _mm256_packus_epi16(index[0], index[1]); // 0 2 1 3 + __m256i idx, mas; + // Clip idx to 127 to apply signed comparison instructions. + idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127)); + // All elements whose indices are less than 48 are set to 0. + // Get shuffle results for indices in range [0, 15]. + mas = ShuffleIndex(c0, idx); + // Get shuffle results for indices in range [16, 31]. + // Subtract 16 to utilize the sign bit of the index. + idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16)); + const __m256i res1 = ShuffleIndex(c1, idx); + // Use OR instruction to combine shuffle results together. + mas = _mm256_or_si256(mas, res1); + // Get shuffle results for indices in range [32, 47]. + // Subtract 16 to utilize the sign bit of the index. 
+ idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16)); + const __m256i res2 = ShuffleIndex(c2, idx); + mas = _mm256_or_si256(mas, res2); + + // For elements whose indices are larger than 47, since they seldom change + // values with the increase of the index, we use comparison and arithmetic + // operations to calculate their values. + // Add -128 to apply signed comparison instructions. + idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128)); + // Elements whose indices are larger than 47 (with value 0) are set to 5. + mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5)); + mas = AdjustValue(mas, idx, 55); // 55 is the last index which value is 5. + mas = AdjustValue(mas, idx, 72); // 72 is the last index which value is 4. + mas = AdjustValue(mas, idx, 101); // 101 is the last index which value is 3. + mas = AdjustValue(mas, idx, 169); // 169 is the last index which value is 2. + mas = AdjustValue(mas, idx, 254); // 254 is the last index which value is 1. + + ma[2] = _mm256_permute4x64_epi64(mas, 0x63); // 32-39 8-15 16-23 24-31 + ma[0] = _mm256_blend_epi32(ma[0], ma[2], 0xfc); // 0-7 8-15 16-23 24-31 + ma[1] = _mm256_permute2x128_si256(ma[0], ma[2], 0x21); + + // b = ma * b * one_over_n + // |ma| = [0, 255] + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. + // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). 
+ const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256()); + const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256()); + __m256i sums[2]; + sums[0] = _mm256_permute2x128_si256(sum[0], sum[1], 0x20); + sums[1] = _mm256_permute2x128_si256(sum[0], sum[1], 0x31); + if (n == 9) { + CalculateB3(sums[0], maq0, b0); + CalculateB3(sums[1], maq1, b1); + } else { + CalculateB5(sums[0], maq0, b0); + CalculateB5(sums[1], maq1, b1); + } +} + +inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2], + const uint32_t scale, __m128i* const ma, + __m128i b[2]) { + __m128i sum, index; + CalculateSumAndIndex5(s5, sq5, scale, &sum, &index); + LookupIntermediate<25>(sum, index, ma, b); +} + +inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2], + const uint32_t scale, __m128i* const ma, + __m128i b[2]) { + __m128i sum, index; + CalculateSumAndIndex3(s3, sq3, scale, &sum, &index); + LookupIntermediate<9>(sum, index, ma, b); +} + +inline void Store343_444(const __m256i b3[3], const ptrdiff_t x, + __m256i sum_b343[2], __m256i sum_b444[2], + uint32_t* const b343, uint32_t* const b444) { + __m256i b[3], sum_b111[2]; + Prepare3_32(b3 + 0, b); + sum_b111[0] = Sum3_32(b); + sum_b444[0] = _mm256_slli_epi32(sum_b111[0], 2); + sum_b343[0] = _mm256_sub_epi32(sum_b444[0], sum_b111[0]); + sum_b343[0] = _mm256_add_epi32(sum_b343[0], b[1]); + Prepare3_32(b3 + 1, b); + sum_b111[1] = Sum3_32(b); + sum_b444[1] = _mm256_slli_epi32(sum_b111[1], 2); + sum_b343[1] = _mm256_sub_epi32(sum_b444[1], sum_b111[1]); + sum_b343[1] = _mm256_add_epi32(sum_b343[1], b[1]); + StoreAligned64(b444 + x, sum_b444); + StoreAligned64(b343 + x, sum_b343); +} + +inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, __m256i* const sum_ma343, + __m256i* const sum_ma444, __m256i sum_b343[2], + __m256i sum_b444[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + 
const __m256i sum_ma111 = Sum3WLo16(ma3); + *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2); + StoreAligned32_ma(ma444 + x, *sum_ma444); + const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111); + *sum_ma343 = VaddwLo8(sum333, ma3[1]); + StoreAligned32_ma(ma343 + x, *sum_ma343); + Store343_444(b3, x, sum_b343, sum_b444, b343, b444); +} + +inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, __m256i* const sum_ma343, + __m256i* const sum_ma444, __m256i sum_b343[2], + __m256i sum_b444[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + const __m256i sum_ma111 = Sum3WHi16(ma3); + *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2); + StoreAligned32_ma(ma444 + x, *sum_ma444); + const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111); + *sum_ma343 = VaddwHi8(sum333, ma3[1]); + StoreAligned32_ma(ma343 + x, *sum_ma343); + Store343_444(b3, x + kMaStoreOffset, sum_b343, sum_b444, b343, b444); +} + +inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, __m256i* const sum_ma343, + __m256i sum_b343[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma444, sum_b444[2]; + Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343, + ma444, b343, b444); +} + +inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, __m256i* const sum_ma343, + __m256i sum_b343[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma444, sum_b444[2]; + Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343, + ma444, b343, b444); +} + +inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma343, sum_b343[2]; + Store343_444Lo(ma3, b3, x, 
&sum_ma343, sum_b343, ma343, ma444, b343, b444); +} + +inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma343, sum_b343[2]; + Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444); +} + +// Don't combine the following 2 functions, which would be slower. +inline void Store343_444(const __m256i ma3[3], const __m256i b3[6], + const ptrdiff_t x, __m256i* const sum_ma343_lo, + __m256i* const sum_ma343_hi, + __m256i* const sum_ma444_lo, + __m256i* const sum_ma444_hi, __m256i sum_b343_lo[2], + __m256i sum_b343_hi[2], __m256i sum_b444_lo[2], + __m256i sum_b444_hi[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_mat343[2], sum_mat444[2]; + const __m256i sum_ma111_lo = Sum3WLo16(ma3); + sum_mat444[0] = _mm256_slli_epi16(sum_ma111_lo, 2); + const __m256i sum333_lo = _mm256_sub_epi16(sum_mat444[0], sum_ma111_lo); + sum_mat343[0] = VaddwLo8(sum333_lo, ma3[1]); + Store343_444(b3, x, sum_b343_lo, sum_b444_lo, b343, b444); + const __m256i sum_ma111_hi = Sum3WHi16(ma3); + sum_mat444[1] = _mm256_slli_epi16(sum_ma111_hi, 2); + *sum_ma444_lo = _mm256_permute2x128_si256(sum_mat444[0], sum_mat444[1], 0x20); + *sum_ma444_hi = _mm256_permute2x128_si256(sum_mat444[0], sum_mat444[1], 0x31); + StoreAligned32(ma444 + x + 0, *sum_ma444_lo); + StoreAligned32(ma444 + x + 16, *sum_ma444_hi); + const __m256i sum333_hi = _mm256_sub_epi16(sum_mat444[1], sum_ma111_hi); + sum_mat343[1] = VaddwHi8(sum333_hi, ma3[1]); + *sum_ma343_lo = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x20); + *sum_ma343_hi = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x31); + StoreAligned32(ma343 + x + 0, *sum_ma343_lo); + StoreAligned32(ma343 + x + 16, *sum_ma343_hi); + Store343_444(b3 + 3, x + 16, sum_b343_hi, sum_b444_hi, b343, b444); +} + +inline void 
Store343_444(const __m256i ma3[3], const __m256i b3[6], + const ptrdiff_t x, __m256i* const sum_ma343_lo, + __m256i* const sum_ma343_hi, __m256i sum_b343_lo[2], + __m256i sum_b343_hi[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma444[2], sum_b444[2], sum_mat343[2]; + const __m256i sum_ma111_lo = Sum3WLo16(ma3); + sum_ma444[0] = _mm256_slli_epi16(sum_ma111_lo, 2); + const __m256i sum333_lo = _mm256_sub_epi16(sum_ma444[0], sum_ma111_lo); + sum_mat343[0] = VaddwLo8(sum333_lo, ma3[1]); + Store343_444(b3, x, sum_b343_lo, sum_b444, b343, b444); + const __m256i sum_ma111_hi = Sum3WHi16(ma3); + sum_ma444[1] = _mm256_slli_epi16(sum_ma111_hi, 2); + StoreAligned64_ma(ma444 + x, sum_ma444); + const __m256i sum333_hi = _mm256_sub_epi16(sum_ma444[1], sum_ma111_hi); + sum_mat343[1] = VaddwHi8(sum333_hi, ma3[1]); + *sum_ma343_lo = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x20); + *sum_ma343_hi = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x31); + StoreAligned32(ma343 + x + 0, *sum_ma343_lo); + StoreAligned32(ma343 + x + 16, *sum_ma343_hi); + Store343_444(b3 + 3, x + 16, sum_b343_hi, sum_b444, b343, b444); +} + +inline void PermuteB(const __m256i t[4], __m256i b[7]) { + // Input: + // 0 1 2 3 // b[0] + // 4 5 6 7 // b[1] + // 8 9 10 11 24 25 26 27 // t[0] + // 12 13 14 15 28 29 30 31 // t[1] + // 16 17 18 19 32 33 34 35 // t[2] + // 20 21 22 23 36 37 38 39 // t[3] + + // Output: + // 0 1 2 3 8 9 10 11 // b[0] + // 4 5 6 7 12 13 14 15 // b[1] + // 8 9 10 11 16 17 18 19 // b[2] + // 16 17 18 19 24 25 26 27 // b[3] + // 20 21 22 23 28 29 30 31 // b[4] + // 24 25 26 27 32 33 34 35 // b[5] + // 20 21 22 23 36 37 38 39 // b[6] + b[0] = _mm256_permute2x128_si256(b[0], t[0], 0x21); + b[1] = _mm256_permute2x128_si256(b[1], t[1], 0x21); + b[2] = _mm256_permute2x128_si256(t[0], t[2], 0x20); + b[3] = _mm256_permute2x128_si256(t[2], t[0], 0x30); + b[4] = _mm256_permute2x128_si256(t[3], t[1], 0x30); 
+ b[5] = _mm256_permute2x128_si256(t[0], t[2], 0x31); + b[6] = t[3]; +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo( + const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma, + __m128i b[2]) { + __m128i s5[2][5], sq5[5][2]; + Square(s[0][1], sq[0] + 2); + Square(s[1][1], sq[1] + 2); + s5[0][3] = Sum5Horizontal16(s[0]); + StoreAligned16(sum5[3], s5[0][3]); + s5[0][4] = Sum5Horizontal16(s[1]); + StoreAligned16(sum5[4], s5[0][4]); + Sum5Horizontal32(sq[0], sq5[3]); + StoreAligned32U32(square_sum5[3], sq5[3]); + Sum5Horizontal32(sq[1], sq5[4]); + StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x3U16(sum5, 0, s5[0]); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5(s5[0], sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5( + const uint16_t* const src0, const uint16_t* const src1, + const ptrdiff_t over_read_in_bytes, const ptrdiff_t sum_width, + const ptrdiff_t x, const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], __m256i sq[2][8], __m256i ma[3], + __m256i b[3]) { + __m256i s[2], s5[2][5], sq5[5][2], sum[2], index[2], t[4]; + s[0] = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 16); + s[1] = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 16); + Square(s[0], sq[0] + 2); + Square(s[1], sq[1] + 2); + sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21); + sq[0][1] = _mm256_permute2x128_si256(sq[0][1], sq[0][3], 0x21); + sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21); + sq[1][1] = _mm256_permute2x128_si256(sq[1][1], sq[1][3], 0x21); + s5[0][3] = Sum5Horizontal16(src0 + 0, over_read_in_bytes + 0); + s5[1][3] = Sum5Horizontal16(src0 + 16, over_read_in_bytes + 32); + s5[0][4] = Sum5Horizontal16(src1 + 0, over_read_in_bytes + 0); + s5[1][4] = Sum5Horizontal16(src1 + 16, over_read_in_bytes + 32); + StoreAligned32(sum5[3] + x + 0, s5[0][3]); + StoreAligned32(sum5[3] + x + 
16, s5[1][3]); + StoreAligned32(sum5[4] + x + 0, s5[0][4]); + StoreAligned32(sum5[4] + x + 16, s5[1][4]); + Sum5Horizontal32(sq[0], sq5[3]); + StoreAligned64(square_sum5[3] + x, sq5[3]); + Sum5Horizontal32(sq[1], sq5[4]); + StoreAligned64(square_sum5[4] + x, sq5[4]); + LoadAligned32x3U16(sum5, x, s5[0]); + LoadAligned64x3U32(square_sum5, x, sq5); + CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]); + + s[0] = LoadUnaligned32Msan(src0 + 24, over_read_in_bytes + 48); + s[1] = LoadUnaligned32Msan(src1 + 24, over_read_in_bytes + 48); + Square(s[0], sq[0] + 6); + Square(s[1], sq[1] + 6); + sq[0][4] = _mm256_permute2x128_si256(sq[0][2], sq[0][6], 0x21); + sq[0][5] = _mm256_permute2x128_si256(sq[0][3], sq[0][7], 0x21); + sq[1][4] = _mm256_permute2x128_si256(sq[1][2], sq[1][6], 0x21); + sq[1][5] = _mm256_permute2x128_si256(sq[1][3], sq[1][7], 0x21); + Sum5Horizontal32(sq[0] + 4, sq5[3]); + StoreAligned64(square_sum5[3] + x + 16, sq5[3]); + Sum5Horizontal32(sq[1] + 4, sq5[4]); + StoreAligned64(square_sum5[4] + x + 16, sq5[4]); + LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]); + CalculateIntermediate<25>(sum, index, ma, t, t + 2); + PermuteB(t, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo( + const __m128i s[2], const uint32_t scale, const uint16_t* const sum5[5], + const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma, + __m128i b[2]) { + __m128i s5[5], sq5[5][2]; + Square(s[1], sq + 2); + s5[3] = s5[4] = Sum5Horizontal16(s); + Sum5Horizontal32(sq, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5(s5, sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow( + const uint16_t* const src, const ptrdiff_t over_read_in_bytes, + const ptrdiff_t sum_width, const 
ptrdiff_t x, const uint32_t scale, + const uint16_t* const sum5[5], const uint32_t* const square_sum5[5], + __m256i sq[3], __m256i ma[3], __m256i b[3]) { + const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16); + __m256i s5[2][5], sq5[5][2], sum[2], index[2], t[4]; + Square(s0, sq + 2); + sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); + sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21); + s5[0][3] = Sum5Horizontal16(src + 0, over_read_in_bytes + 0); + s5[1][3] = Sum5Horizontal16(src + 16, over_read_in_bytes + 32); + s5[0][4] = s5[0][3]; + s5[1][4] = s5[1][3]; + Sum5Horizontal32(sq, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned32x3U16(sum5, x, s5[0]); + LoadAligned64x3U32(square_sum5, x, sq5); + CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]); + + const __m256i s1 = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48); + Square(s1, sq + 6); + sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21); + sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21); + Sum5Horizontal32(sq + 4, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]); + CalculateIntermediate<25>(sum, index, ma, t, t + 2); + PermuteB(t, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo( + const __m128i s[2], const uint32_t scale, uint16_t* const sum3[3], + uint32_t* const square_sum3[3], __m128i sq[4], __m128i* const ma, + __m128i b[2]) { + __m128i s3[3], sq3[3][2]; + Square(s[1], sq + 2); + s3[2] = Sum3Horizontal16(s); + StoreAligned16(sum3[2], s3[2]); + Sum3Horizontal32(sq, sq3[2]); + StoreAligned32U32(square_sum3[2], sq3[2]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3(s3, sq3, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3( + const uint16_t* const src, 
const ptrdiff_t over_read_in_bytes, + const ptrdiff_t x, const ptrdiff_t sum_width, const uint32_t scale, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], __m256i sq[8], + __m256i ma[3], __m256i b[7]) { + __m256i s[2], s3[4], sq3[3][2], sum[2], index[2], t[4]; + s[0] = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16); + s[1] = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48); + Square(s[0], sq + 2); + sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); + sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21); + s3[2] = Sum3Horizontal16(src, over_read_in_bytes); + s3[3] = Sum3Horizontal16(src + 16, over_read_in_bytes + 32); + StoreAligned64(sum3[2] + x, s3 + 2); + Sum3Horizontal32(sq + 0, sq3[2]); + StoreAligned64(square_sum3[2] + x, sq3[2]); + LoadAligned32x2U16(sum3, x, s3); + LoadAligned64x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]); + + Square(s[1], sq + 6); + sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21); + sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21); + Sum3Horizontal32(sq + 4, sq3[2]); + StoreAligned64(square_sum3[2] + x + 16, sq3[2]); + LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3 + 1); + LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3); + CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]); + CalculateIntermediate<9>(sum, index, ma, t, t + 2); + PermuteB(t, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo( + const __m128i s[2][4], const uint16_t scales[2], uint16_t* const sum3[4], + uint16_t* const sum5[5], uint32_t* const square_sum3[4], + uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma3[2][3], + __m128i b3[2][10], __m128i* const ma5, __m128i b5[2]) { + __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2]; + Square(s[0][1], sq[0] + 2); + Square(s[1][1], sq[1] + 2); + SumHorizontal16(s[0], &s3[2], &s5[3]); + SumHorizontal16(s[1], &s3[3], &s5[4]); + StoreAligned16(sum3[2], s3[2]); + StoreAligned16(sum3[3], s3[3]); + 
StoreAligned16(sum5[3], s5[3]); + StoreAligned16(sum5[4], s5[4]); + SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2], sq3[2]); + StoreAligned32U32(square_sum5[3], sq5[3]); + SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3], sq3[3]); + StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]); + CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]); + CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]); + ma3[1][0] = _mm_srli_si128(ma3[0][0], 8); + CalculateIntermediate5(s5, sq5, scales[0], ma5, b5); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( + const uint16_t* const src0, const uint16_t* const src1, + const ptrdiff_t over_read_in_bytes, const ptrdiff_t x, + const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, __m256i sq[2][8], __m256i ma3[2][3], + __m256i b3[2][7], __m256i ma5[3], __m256i b5[5]) { + __m256i s[2], s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2][2], + index_3[2][2], sum_5[2], index_5[2], t[4]; + s[0] = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 16); + s[1] = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 16); + Square(s[0], sq[0] + 2); + Square(s[1], sq[1] + 2); + sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21); + sq[0][1] = _mm256_permute2x128_si256(sq[0][1], sq[0][3], 0x21); + sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21); + sq[1][1] = _mm256_permute2x128_si256(sq[1][1], sq[1][3], 0x21); + SumHorizontal16(src0, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3], + &s5[1][3]); + SumHorizontal16(src1, over_read_in_bytes, &s3[0][3], 
&s3[1][3], &s5[0][4], + &s5[1][4]); + StoreAligned32(sum3[2] + x + 0, s3[0][2]); + StoreAligned32(sum3[2] + x + 16, s3[1][2]); + StoreAligned32(sum3[3] + x + 0, s3[0][3]); + StoreAligned32(sum3[3] + x + 16, s3[1][3]); + StoreAligned32(sum5[3] + x + 0, s5[0][3]); + StoreAligned32(sum5[3] + x + 16, s5[1][3]); + StoreAligned32(sum5[4] + x + 0, s5[0][4]); + StoreAligned32(sum5[4] + x + 16, s5[1][4]); + SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned64(square_sum3[2] + x, sq3[2]); + StoreAligned64(square_sum5[3] + x, sq5[3]); + StoreAligned64(square_sum3[3] + x, sq3[3]); + StoreAligned64(square_sum5[4] + x, sq5[4]); + LoadAligned32x2U16(sum3, x, s3[0]); + LoadAligned64x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0][0], &index_3[0][0]); + CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum_3[1][0], + &index_3[1][0]); + LoadAligned32x3U16(sum5, x, s5[0]); + LoadAligned64x3U32(square_sum5, x, sq5); + CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]); + + s[0] = LoadUnaligned32Msan(src0 + 24, over_read_in_bytes + 48); + s[1] = LoadUnaligned32Msan(src1 + 24, over_read_in_bytes + 48); + Square(s[0], sq[0] + 6); + Square(s[1], sq[1] + 6); + sq[0][4] = _mm256_permute2x128_si256(sq[0][2], sq[0][6], 0x21); + sq[0][5] = _mm256_permute2x128_si256(sq[0][3], sq[0][7], 0x21); + sq[1][4] = _mm256_permute2x128_si256(sq[1][2], sq[1][6], 0x21); + sq[1][5] = _mm256_permute2x128_si256(sq[1][3], sq[1][7], 0x21); + SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned64(square_sum3[2] + x + 16, sq3[2]); + StoreAligned64(square_sum5[3] + x + 16, sq5[3]); + StoreAligned64(square_sum3[3] + x + 16, sq3[3]); + StoreAligned64(square_sum5[4] + x + 16, sq5[4]); + LoadAligned32x2U16Msan(sum3, x + 16, sum_width, 
s3[1]); + LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[0][1], &index_3[0][1]); + CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum_3[1][1], + &index_3[1][1]); + CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], t, t + 2); + PermuteB(t, b3[0]); + CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], t, t + 2); + PermuteB(t, b3[1]); + LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]); + CalculateIntermediate<25>(sum_5, index_5, ma5, t, t + 2); + PermuteB(t, b5); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo( + const __m128i s[2], const uint16_t scales[2], const uint16_t* const sum3[4], + const uint16_t* const sum5[5], const uint32_t* const square_sum3[4], + const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma3, + __m128i* const ma5, __m128i b3[2], __m128i b5[2]) { + __m128i s3[3], s5[5], sq3[3][2], sq5[5][2]; + Square(s[1], sq + 2); + SumHorizontal16(s, &s3[2], &s5[3]); + SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned16x3U16(sum5, 0, s5); + s5[4] = s5[3]; + LoadAligned32x3U32(square_sum5, 0, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateIntermediate5(s5, sq5, scales[0], ma5, b5); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3(s3, sq3, scales[1], ma3, b3); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( + const uint16_t* const src, const ptrdiff_t over_read_in_bytes, + const ptrdiff_t sum_width, const ptrdiff_t x, const uint16_t scales[2], + const uint16_t* const sum3[4], const uint16_t* const sum5[5], + const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5], + __m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5], + __m256i b5[5]) { + const __m256i s0 = 
LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16); + __m256i s3[2][3], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2], index_3[2], + sum_5[2], index_5[2], t[4]; + Square(s0, sq + 2); + sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); + sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21); + SumHorizontal16(src, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3], + &s5[1][3]); + SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned32x2U16(sum3, x, s3[0]); + LoadAligned64x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0], &index_3[0]); + LoadAligned32x3U16(sum5, x, s5[0]); + s5[0][4] = s5[0][3]; + LoadAligned64x3U32(square_sum5, x, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]); + + const __m256i s1 = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48); + Square(s1, sq + 6); + sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21); + sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21); + SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]); + LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[1], &index_3[1]); + CalculateIntermediate<9>(sum_3, index_3, ma3, t, t + 2); + PermuteB(t, b3); + LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); + s5[1][4] = s5[1][3]; + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]); + CalculateIntermediate<25>(sum_5, index_5, ma5, t, t + 2); + PermuteB(t, b5); +} + +inline void BoxSumFilterPreProcess5(const uint16_t* const src0, + const uint16_t* const src1, const int width, + const uint32_t scale, + uint16_t* const sum5[5], + uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* ma565, + 
uint32_t* b565) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1_128 - sizeof(*src0) * width; + __m128i s[2][2], ma0, sq_128[2][4], b0[2]; + __m256i mas[3], sq[2][8], bs[10]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq_128[0]); + Square(s[1][0], sq_128[1]); + BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, b0); + sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]); + sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]); + sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]); + sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]); + mas[0] = SetrM128i(ma0, ma0); + bs[0] = SetrM128i(b0[0], b0[0]); + bs[1] = SetrM128i(b0[1], b0[1]); + + int x = 0; + do { + __m256i ma5[3], ma[2], b[4]; + BoxFilterPreProcess5( + src0 + x + 8, src1 + x + 8, + kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width, + x + 8, scale, sum5, square_sum5, sq, mas, bs); + Prepare3_8(mas, ma5); + ma[0] = Sum565Lo(ma5); + ma[1] = Sum565Hi(ma5); + StoreAligned64_ma(ma565, ma); + Sum565(bs + 0, b + 0); + Sum565(bs + 3, b + 2); + StoreAligned64(b565, b + 0); + StoreAligned64(b565 + 16, b + 2); + sq[0][0] = sq[0][6]; + sq[0][1] = sq[0][7]; + sq[1][0] = sq[1][6]; + sq[1][1] = sq[1][7]; + mas[0] = mas[2]; + bs[0] = bs[5]; + bs[1] = bs[6]; + ma565 += 32; + b565 += 32; + x += 32; + } while (x < width); +} + +template <bool calculate444> +LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3( + const uint16_t* const src, const int width, const uint32_t scale, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], + const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343, + uint32_t* b444) { + const ptrdiff_t overread_in_bytes_128 = + kOverreadInBytesPass2_128 - sizeof(*src) * width; + __m128i s[2], ma0, sq_128[4], 
b0[2]; + __m256i mas[3], sq[8], bs[7]; + s[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128 + 0); + s[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16); + Square(s[0], sq_128); + BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, b0); + sq[0] = SetrM128i(sq_128[2], sq_128[2]); + sq[1] = SetrM128i(sq_128[3], sq_128[3]); + mas[0] = SetrM128i(ma0, ma0); + bs[0] = SetrM128i(b0[0], b0[0]); + bs[1] = SetrM128i(b0[1], b0[1]); + + int x = 0; + do { + __m256i ma3[3]; + BoxFilterPreProcess3( + src + x + 8, kOverreadInBytesPass2_256 + sizeof(*src) * (x + 8 - width), + x + 8, sum_width, scale, sum3, square_sum3, sq, mas, bs); + Prepare3_8(mas, ma3); + if (calculate444) { // NOLINT(readability-simplify-boolean-expr) + Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444); + Store343_444Hi(ma3, bs + 3, kMaStoreOffset, ma343, ma444, b343, b444); + ma444 += 32; + b444 += 32; + } else { + __m256i ma[2], b[4]; + ma[0] = Sum343Lo(ma3); + ma[1] = Sum343Hi(ma3); + StoreAligned64_ma(ma343, ma); + Sum343(bs + 0, b + 0); + Sum343(bs + 3, b + 2); + StoreAligned64(b343 + 0, b + 0); + StoreAligned64(b343 + 16, b + 2); + } + sq[0] = sq[6]; + sq[1] = sq[7]; + mas[0] = mas[2]; + bs[0] = bs[5]; + bs[1] = bs[6]; + ma343 += 32; + b343 += 32; + x += 32; + } while (x < width); +} + +inline void BoxSumFilterPreProcess( + const uint16_t* const src0, const uint16_t* const src1, const int width, + const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444, + uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444, + uint32_t* b565) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1_128 - sizeof(*src0) * width; + __m128i s[2][4], ma3_128[2][3], ma5_128[3], sq_128[2][8], b3_128[2][10], + b5_128[10]; + __m256i ma3[2][3], ma5[3], sq[2][8], b3[2][7], b5[7]; + s[0][0] = LoadUnaligned16Msan(src0 + 
0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq_128[0]); + Square(s[1][0], sq_128[1]); + BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128, + ma3_128, b3_128, &ma5_128[0], b5_128); + sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]); + sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]); + sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]); + sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]); + ma3[0][0] = SetrM128i(ma3_128[0][0], ma3_128[0][0]); + ma3[1][0] = SetrM128i(ma3_128[1][0], ma3_128[1][0]); + ma5[0] = SetrM128i(ma5_128[0], ma5_128[0]); + b3[0][0] = SetrM128i(b3_128[0][0], b3_128[0][0]); + b3[0][1] = SetrM128i(b3_128[0][1], b3_128[0][1]); + b3[1][0] = SetrM128i(b3_128[1][0], b3_128[1][0]); + b3[1][1] = SetrM128i(b3_128[1][1], b3_128[1][1]); + b5[0] = SetrM128i(b5_128[0], b5_128[0]); + b5[1] = SetrM128i(b5_128[1], b5_128[1]); + + int x = 0; + do { + __m256i ma[2], b[4], ma3x[3], ma5x[3]; + BoxFilterPreProcess( + src0 + x + 8, src1 + x + 8, + kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), x + 8, + scales, sum3, sum5, square_sum3, square_sum5, sum_width, sq, ma3, b3, + ma5, b5); + Prepare3_8(ma3[0], ma3x); + ma[0] = Sum343Lo(ma3x); + ma[1] = Sum343Hi(ma3x); + StoreAligned64_ma(ma343[0] + x, ma); + Sum343(b3[0], b); + Sum343(b3[0] + 3, b + 2); + StoreAligned64(b343[0] + x, b); + StoreAligned64(b343[0] + x + 16, b + 2); + Prepare3_8(ma3[1], ma3x); + Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444); + Store343_444Hi(ma3x, b3[1] + 3, x + kMaStoreOffset, ma343[1], ma444, + b343[1], b444); + Prepare3_8(ma5, ma5x); + ma[0] = Sum565Lo(ma5x); + ma[1] = Sum565Hi(ma5x); + StoreAligned64_ma(ma565, ma); + Sum565(b5, b); + StoreAligned64(b565, b); + Sum565(b5 + 3, b); + StoreAligned64(b565 + 16, b); + sq[0][0] = sq[0][6]; + 
sq[0][1] = sq[0][7]; + sq[1][0] = sq[1][6]; + sq[1][1] = sq[1][7]; + ma3[0][0] = ma3[0][2]; + ma3[1][0] = ma3[1][2]; + ma5[0] = ma5[2]; + b3[0][0] = b3[0][5]; + b3[0][1] = b3[0][6]; + b3[1][0] = b3[1][5]; + b3[1][1] = b3[1][6]; + b5[0] = b5[5]; + b5[1] = b5[6]; + ma565 += 32; + b565 += 32; + x += 32; + } while (x < width); +} + +template <int shift> +inline __m256i FilterOutput(const __m256i ma_x_src, const __m256i b) { + // ma: 255 * 32 = 8160 (13 bits) + // b: 65088 * 32 = 2082816 (21 bits) + // v: b - ma * 255 (22 bits) + const __m256i v = _mm256_sub_epi32(b, ma_x_src); + // kSgrProjSgrBits = 8 + // kSgrProjRestoreBits = 4 + // shift = 4 or 5 + // v >> 8 or 9 (13 bits) + return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits); +} + +template <int shift> +inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma, + const __m256i b[2]) { + const __m256i ma_x_src_lo = VmullLo16(ma, src); + const __m256i ma_x_src_hi = VmullHi16(ma, src); + const __m256i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]); + const __m256i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]); + return _mm256_packs_epi32(dst_lo, dst_hi); // 13 bits +} + +inline __m256i CalculateFilteredOutputPass1(const __m256i src, + const __m256i ma[2], + const __m256i b[2][2]) { + const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]); + __m256i b_sum[2]; + b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]); + b_sum[1] = _mm256_add_epi32(b[0][1], b[1][1]); + return CalculateFilteredOutput<5>(src, ma_sum, b_sum); +} + +inline __m256i CalculateFilteredOutputPass2(const __m256i src, + const __m256i ma[3], + const __m256i b[3][2]) { + const __m256i ma_sum = Sum3_16(ma); + __m256i b_sum[2]; + Sum3_32(b, b_sum); + return CalculateFilteredOutput<5>(src, ma_sum, b_sum); +} + +inline __m256i SelfGuidedFinal(const __m256i src, const __m256i v[2]) { + const __m256i v_lo = + VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits); + const __m256i v_hi = + VrshrS32(v[1], kSgrProjRestoreBits + 
kSgrProjPrecisionBits); + const __m256i vv = _mm256_packs_epi32(v_lo, v_hi); + return _mm256_add_epi16(src, vv); +} + +inline __m256i SelfGuidedDoubleMultiplier(const __m256i src, + const __m256i filter[2], const int w0, + const int w2) { + __m256i v[2]; + const __m256i w0_w2 = + _mm256_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0)); + const __m256i f_lo = _mm256_unpacklo_epi16(filter[0], filter[1]); + const __m256i f_hi = _mm256_unpackhi_epi16(filter[0], filter[1]); + v[0] = _mm256_madd_epi16(w0_w2, f_lo); + v[1] = _mm256_madd_epi16(w0_w2, f_hi); + return SelfGuidedFinal(src, v); +} + +inline __m256i SelfGuidedSingleMultiplier(const __m256i src, + const __m256i filter, const int w0) { + // weight: -96 to 96 (Sgrproj_Xqd_Min/Max) + __m256i v[2]; + v[0] = VmullNLo8(filter, w0); + v[1] = VmullNHi8(filter, w0); + return SelfGuidedFinal(src, v); +} + +inline void ClipAndStore(uint16_t* const dst, const __m256i val) { + const __m256i val0 = _mm256_max_epi16(val, _mm256_setzero_si256()); + const __m256i val1 = _mm256_min_epi16(val0, _mm256_set1_epi16(1023)); + StoreUnaligned32(dst, val1); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( + const uint16_t* const src, const uint16_t* const src0, + const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width, + const uint32_t scale, const int16_t w0, uint16_t* const ma565[2], + uint32_t* const b565[2], uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1_128 - sizeof(*src0) * width; + __m128i s[2][2], ma0, sq_128[2][4], b0[2]; + __m256i mas[3], sq[2][8], bs[7]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq_128[0]); + Square(s[1][0], sq_128[1]); + 
BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, b0); + sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]); + sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]); + sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]); + sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]); + mas[0] = SetrM128i(ma0, ma0); + bs[0] = SetrM128i(b0[0], b0[0]); + bs[1] = SetrM128i(b0[1], b0[1]); + + int x = 0; + do { + __m256i ma5[3], ma[4], b[4][2]; + BoxFilterPreProcess5( + src0 + x + 8, src1 + x + 8, + kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width, + x + 8, scale, sum5, square_sum5, sq, mas, bs); + Prepare3_8(mas, ma5); + ma[2] = Sum565Lo(ma5); + ma[3] = Sum565Hi(ma5); + ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x20); + ma[3] = _mm256_permute2x128_si256(ma[2], ma[3], 0x31); + StoreAligned32(ma565[1] + x + 0, ma[1]); + StoreAligned32(ma565[1] + x + 16, ma[3]); + Sum565(bs + 0, b[1]); + Sum565(bs + 3, b[3]); + StoreAligned64(b565[1] + x, b[1]); + StoreAligned64(b565[1] + x + 16, b[3]); + const __m256i sr0_lo = LoadUnaligned32(src + x + 0); + ma[0] = LoadAligned32(ma565[0] + x); + LoadAligned64(b565[0] + x, b[0]); + const __m256i p0 = CalculateFilteredOutputPass1(sr0_lo, ma, b); + const __m256i d0 = SelfGuidedSingleMultiplier(sr0_lo, p0, w0); + ClipAndStore(dst + x + 0, d0); + const __m256i sr0_hi = LoadUnaligned32(src + x + 16); + ma[2] = LoadAligned32(ma565[0] + x + 16); + LoadAligned64(b565[0] + x + 16, b[2]); + const __m256i p1 = CalculateFilteredOutputPass1(sr0_hi, ma + 2, b + 2); + const __m256i d1 = SelfGuidedSingleMultiplier(sr0_hi, p1, w0); + ClipAndStore(dst + x + 16, d1); + const __m256i sr1_lo = LoadUnaligned32(src + stride + x + 0); + const __m256i p10 = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]); + const __m256i d10 = SelfGuidedSingleMultiplier(sr1_lo, p10, w0); + ClipAndStore(dst + stride + x + 0, d10); + const __m256i sr1_hi = LoadUnaligned32(src + stride + x + 16); + const __m256i p11 = CalculateFilteredOutput<4>(sr1_hi, ma[3], 
b[3]); + const __m256i d11 = SelfGuidedSingleMultiplier(sr1_hi, p11, w0); + ClipAndStore(dst + stride + x + 16, d11); + sq[0][0] = sq[0][6]; + sq[0][1] = sq[0][7]; + sq[1][0] = sq[1][6]; + sq[1][1] = sq[1][7]; + mas[0] = mas[2]; + bs[0] = bs[5]; + bs[1] = bs[6]; + x += 32; + } while (x < width); +} + +inline void BoxFilterPass1LastRow( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0, + uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565, + uint32_t* b565, uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1_128 - sizeof(*src0) * width; + __m128i s[2], ma0[2], sq_128[8], b0[6]; + __m256i mas[3], sq[8], bs[7]; + s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + Square(s[0], sq_128); + BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq_128, &ma0[0], + b0); + sq[0] = SetrM128i(sq_128[2], sq_128[2]); + sq[1] = SetrM128i(sq_128[3], sq_128[3]); + mas[0] = SetrM128i(ma0[0], ma0[0]); + bs[0] = SetrM128i(b0[0], b0[0]); + bs[1] = SetrM128i(b0[1], b0[1]); + + int x = 0; + do { + __m256i ma5[3], ma[4], b[4][2]; + BoxFilterPreProcess5LastRow( + src0 + x + 8, + kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width, + x + 8, scale, sum5, square_sum5, sq, mas, bs); + Prepare3_8(mas, ma5); + ma[2] = Sum565Lo(ma5); + ma[3] = Sum565Hi(ma5); + Sum565(bs + 0, b[1]); + Sum565(bs + 3, b[3]); + const __m256i sr0_lo = LoadUnaligned32(src + x + 0); + ma[0] = LoadAligned32(ma565 + x); + ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x20); + LoadAligned64(b565 + x, b[0]); + const __m256i p0 = CalculateFilteredOutputPass1(sr0_lo, ma, b); + const __m256i d0 = SelfGuidedSingleMultiplier(sr0_lo, p0, w0); + ClipAndStore(dst + x + 0, d0); + const __m256i sr0_hi = LoadUnaligned32(src + x + 16); + ma[0] = LoadAligned32(ma565 + x + 16); + ma[1] = 
_mm256_permute2x128_si256(ma[2], ma[3], 0x31); + LoadAligned64(b565 + x + 16, b[2]); + const __m256i p1 = CalculateFilteredOutputPass1(sr0_hi, ma, b + 2); + const __m256i d1 = SelfGuidedSingleMultiplier(sr0_hi, p1, w0); + ClipAndStore(dst + x + 16, d1); + sq[0] = sq[6]; + sq[1] = sq[7]; + mas[0] = mas[2]; + bs[0] = bs[5]; + bs[1] = bs[6]; + x += 32; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass2( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], + uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3], + uint32_t* const b444[2], uint16_t* const dst) { + const ptrdiff_t overread_in_bytes_128 = + kOverreadInBytesPass2_128 - sizeof(*src0) * width; + __m128i s0[2], ma0, sq_128[4], b0[2]; + __m256i mas[3], sq[8], bs[7]; + s0[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes_128 + 0); + s0[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes_128 + 16); + Square(s0[0], sq_128); + BoxFilterPreProcess3Lo(s0, scale, sum3, square_sum3, sq_128, &ma0, b0); + sq[0] = SetrM128i(sq_128[2], sq_128[2]); + sq[1] = SetrM128i(sq_128[3], sq_128[3]); + mas[0] = SetrM128i(ma0, ma0); + bs[0] = SetrM128i(b0[0], b0[0]); + bs[1] = SetrM128i(b0[1], b0[1]); + + int x = 0; + do { + __m256i ma[4], b[4][2], ma3[3]; + BoxFilterPreProcess3( + src0 + x + 8, + kOverreadInBytesPass2_256 + sizeof(*src0) * (x + 8 - width), x + 8, + sum_width, scale, sum3, square_sum3, sq, mas, bs); + Prepare3_8(mas, ma3); + Store343_444(ma3, bs, x, &ma[2], &ma[3], b[2], b[3], ma343[2], ma444[1], + b343[2], b444[1]); + const __m256i sr_lo = LoadUnaligned32(src + x + 0); + const __m256i sr_hi = LoadUnaligned32(src + x + 16); + ma[0] = LoadAligned32(ma343[0] + x); + ma[1] = LoadAligned32(ma444[0] + x); + LoadAligned64(b343[0] + x, b[0]); + LoadAligned64(b444[0] + x, b[1]); + const __m256i p0 = 
CalculateFilteredOutputPass2(sr_lo, ma, b); + ma[1] = LoadAligned32(ma343[0] + x + 16); + ma[2] = LoadAligned32(ma444[0] + x + 16); + LoadAligned64(b343[0] + x + 16, b[1]); + LoadAligned64(b444[0] + x + 16, b[2]); + const __m256i p1 = CalculateFilteredOutputPass2(sr_hi, ma + 1, b + 1); + const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0); + const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0); + ClipAndStore(dst + x + 0, d0); + ClipAndStore(dst + x + 16, d1); + sq[0] = sq[6]; + sq[1] = sq[7]; + mas[0] = mas[2]; + bs[0] = bs[5]; + bs[1] = bs[6]; + x += 32; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilter( + const uint16_t* const src, const uint16_t* const src0, + const uint16_t* const src1, const ptrdiff_t stride, const int width, + const uint16_t scales[2], const int16_t w0, const int16_t w2, + uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* const ma343[4], + uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4], + uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1_128 - sizeof(*src0) * width; + __m128i s[2][4], ma3_128[2][3], ma5_0, sq_128[2][8], b3_128[2][10], b5_128[2]; + __m256i ma3[2][3], ma5[3], sq[2][8], b3[2][7], b5[7]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq_128[0]); + Square(s[1][0], sq_128[1]); + BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128, + ma3_128, b3_128, &ma5_0, b5_128); + sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]); + sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]); + sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]); + 
sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]); + ma3[0][0] = SetrM128i(ma3_128[0][0], ma3_128[0][0]); + ma3[1][0] = SetrM128i(ma3_128[1][0], ma3_128[1][0]); + ma5[0] = SetrM128i(ma5_0, ma5_0); + b3[0][0] = SetrM128i(b3_128[0][0], b3_128[0][0]); + b3[0][1] = SetrM128i(b3_128[0][1], b3_128[0][1]); + b3[1][0] = SetrM128i(b3_128[1][0], b3_128[1][0]); + b3[1][1] = SetrM128i(b3_128[1][1], b3_128[1][1]); + b5[0] = SetrM128i(b5_128[0], b5_128[0]); + b5[1] = SetrM128i(b5_128[1], b5_128[1]); + + int x = 0; + do { + __m256i ma[3][4], mat[3][3], b[3][3][2], bt[3][3][2], p[2][2], ma3x[2][3], + ma5x[3]; + BoxFilterPreProcess( + src0 + x + 8, src1 + x + 8, + kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), x + 8, + scales, sum3, sum5, square_sum3, square_sum5, sum_width, sq, ma3, b3, + ma5, b5); + Prepare3_8(ma3[0], ma3x[0]); + Prepare3_8(ma3[1], ma3x[1]); + Prepare3_8(ma5, ma5x); + Store343_444(ma3x[0], b3[0], x, &ma[1][2], &mat[1][2], &ma[2][1], + &mat[2][1], b[1][2], bt[1][2], b[2][1], bt[2][1], ma343[2], + ma444[1], b343[2], b444[1]); + Store343_444(ma3x[1], b3[1], x, &ma[2][2], &mat[2][2], b[2][2], bt[2][2], + ma343[3], ma444[2], b343[3], b444[2]); + + ma[0][2] = Sum565Lo(ma5x); + ma[0][3] = Sum565Hi(ma5x); + ma[0][1] = _mm256_permute2x128_si256(ma[0][2], ma[0][3], 0x20); + ma[0][3] = _mm256_permute2x128_si256(ma[0][2], ma[0][3], 0x31); + StoreAligned32(ma565[1] + x + 0, ma[0][1]); + StoreAligned32(ma565[1] + x + 16, ma[0][3]); + Sum565(b5, b[0][1]); + StoreAligned64(b565[1] + x, b[0][1]); + const __m256i sr0_lo = LoadUnaligned32(src + x); + const __m256i sr1_lo = LoadUnaligned32(src + stride + x); + ma[0][0] = LoadAligned32(ma565[0] + x); + LoadAligned64(b565[0] + x, b[0][0]); + p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]); + ma[1][0] = LoadAligned32(ma343[0] + x); + ma[1][1] = LoadAligned32(ma444[0] + x); + // Keeping the following 4 redundant lines is faster. 
The reason is that + // there are not enough registers available, and these values could be saved + // and loaded which is even slower. + ma[1][2] = LoadAligned32(ma343[2] + x); // Redundant line 1. + LoadAligned64(b343[0] + x, b[1][0]); + LoadAligned64(b444[0] + x, b[1][1]); + p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]); + ma[2][0] = LoadAligned32(ma343[1] + x); + ma[2][1] = LoadAligned32(ma444[1] + x); // Redundant line 2. + LoadAligned64(b343[1] + x, b[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]); + const __m256i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2); + ClipAndStore(dst + x, d00); + const __m256i d10x = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2); + ClipAndStore(dst + stride + x, d10x); + + Sum565(b5 + 3, bt[0][1]); + StoreAligned64(b565[1] + x + 16, bt[0][1]); + const __m256i sr0_hi = LoadUnaligned32(src + x + 16); + const __m256i sr1_hi = LoadUnaligned32(src + stride + x + 16); + ma[0][2] = LoadAligned32(ma565[0] + x + 16); + LoadAligned64(b565[0] + x + 16, bt[0][0]); + p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0] + 2, bt[0]); + p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][3], bt[0][1]); + mat[1][0] = LoadAligned32(ma343[0] + x + 16); + mat[1][1] = LoadAligned32(ma444[0] + x + 16); + mat[1][2] = LoadAligned32(ma343[2] + x + 16); // Redundant line 3. + LoadAligned64(b343[0] + x + 16, bt[1][0]); + LoadAligned64(b444[0] + x + 16, bt[1][1]); + p[0][1] = CalculateFilteredOutputPass2(sr0_hi, mat[1], bt[1]); + mat[2][0] = LoadAligned32(ma343[1] + x + 16); + mat[2][1] = LoadAligned32(ma444[1] + x + 16); // Redundant line 4. 
+ LoadAligned64(b343[1] + x + 16, bt[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_hi, mat[2], bt[2]); + const __m256i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2); + ClipAndStore(dst + x + 16, d01); + const __m256i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2); + ClipAndStore(dst + stride + x + 16, d11); + + sq[0][0] = sq[0][6]; + sq[0][1] = sq[0][7]; + sq[1][0] = sq[1][6]; + sq[1][1] = sq[1][7]; + ma3[0][0] = ma3[0][2]; + ma3[1][0] = ma3[1][2]; + ma5[0] = ma5[2]; + b3[0][0] = b3[0][5]; + b3[0][1] = b3[0][6]; + b3[1][0] = b3[1][5]; + b3[1][1] = b3[1][6]; + b5[0] = b5[5]; + b5[1] = b5[6]; + x += 32; + } while (x < width); +} + +inline void BoxFilterLastRow( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0, + const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565, + uint32_t* const b343, uint32_t* const b444, uint32_t* const b565, + uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1_128 - sizeof(*src0) * width; + __m128i s[2], ma3_0, ma5_0, sq_128[4], b3_128[2], b5_128[2]; + __m256i ma3[3], ma5[3], sq[8], b3[7], b5[7]; + s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + Square(s[0], sq_128); + BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5, + sq_128, &ma3_0, &ma5_0, b3_128, b5_128); + sq[0] = SetrM128i(sq_128[2], sq_128[2]); + sq[1] = SetrM128i(sq_128[3], sq_128[3]); + ma3[0] = SetrM128i(ma3_0, ma3_0); + ma5[0] = SetrM128i(ma5_0, ma5_0); + b3[0] = SetrM128i(b3_128[0], b3_128[0]); + b3[1] = SetrM128i(b3_128[1], b3_128[1]); + b5[0] = SetrM128i(b5_128[0], b5_128[0]); + b5[1] = SetrM128i(b5_128[1], b5_128[1]); + + int x = 0; + do { + __m256i ma[4], mat[4], b[3][2], 
bt[3][2], ma3x[3], ma5x[3], p[2]; + BoxFilterPreProcessLastRow( + src0 + x + 8, + kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width, + x + 8, scales, sum3, sum5, square_sum3, square_sum5, sq, ma3, ma5, b3, + b5); + Prepare3_8(ma3, ma3x); + Prepare3_8(ma5, ma5x); + ma[2] = Sum565Lo(ma5x); + Sum565(b5, b[1]); + mat[1] = Sum565Hi(ma5x); + Sum565(b5 + 3, bt[1]); + ma[3] = Sum343Lo(ma3x); + Sum343(b3, b[2]); + mat[2] = Sum343Hi(ma3x); + Sum343(b3 + 3, bt[2]); + + const __m256i sr_lo = LoadUnaligned32(src + x); + ma[0] = LoadAligned32(ma565 + x); + ma[1] = _mm256_permute2x128_si256(ma[2], mat[1], 0x20); + mat[1] = _mm256_permute2x128_si256(ma[2], mat[1], 0x31); + LoadAligned64(b565 + x, b[0]); + p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b); + ma[0] = LoadAligned32(ma343 + x); + ma[1] = LoadAligned32(ma444 + x); + ma[2] = _mm256_permute2x128_si256(ma[3], mat[2], 0x20); + LoadAligned64(b343 + x, b[0]); + LoadAligned64(b444 + x, b[1]); + p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b); + const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2); + + const __m256i sr_hi = LoadUnaligned32(src + x + 16); + mat[0] = LoadAligned32(ma565 + x + 16); + LoadAligned64(b565 + x + 16, bt[0]); + p[0] = CalculateFilteredOutputPass1(sr_hi, mat, bt); + mat[0] = LoadAligned32(ma343 + x + 16); + mat[1] = LoadAligned32(ma444 + x + 16); + mat[2] = _mm256_permute2x128_si256(ma[3], mat[2], 0x31); + LoadAligned64(b343 + x + 16, bt[0]); + LoadAligned64(b444 + x + 16, bt[1]); + p[1] = CalculateFilteredOutputPass2(sr_hi, mat, bt); + const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2); + ClipAndStore(dst + x + 0, d0); + ClipAndStore(dst + x + 16, d1); + + sq[0] = sq[6]; + sq[1] = sq[7]; + ma3[0] = ma3[2]; + ma5[0] = ma5[2]; + b3[0] = b3[5]; + b3[1] = b3[6]; + b5[0] = b5[5]; + b5[1] = b5[6]; + x += 32; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( + const RestorationUnitInfo& restoration_info, const uint16_t* src, + const 
ptrdiff_t stride, const uint16_t* const top_border, + const ptrdiff_t top_border_stride, const uint16_t* bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + SgrBuffer* const sgr_buffer, uint16_t* dst) { + const auto temp_stride = Align<ptrdiff_t>(width, 32); + const auto sum_width = temp_stride + 8; + const auto sum_stride = temp_stride + 32; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12. + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2]; + uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2]; + sum3[0] = sgr_buffer->sum3 + kSumOffset; + square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset; + ma343[0] = sgr_buffer->ma343; + b343[0] = sgr_buffer->b343; + for (int i = 1; i <= 3; ++i) { + sum3[i] = sum3[i - 1] + sum_stride; + square_sum3[i] = square_sum3[i - 1] + sum_stride; + ma343[i] = ma343[i - 1] + temp_stride; + b343[i] = b343[i - 1] + temp_stride; + } + sum5[0] = sgr_buffer->sum5 + kSumOffset; + square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma444[0] = sgr_buffer->ma444; + b444[0] = sgr_buffer->b444; + for (int i = 1; i <= 2; ++i) { + ma444[i] = ma444[i - 1] + temp_stride; + b444[i] = b444[i - 1] + temp_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scales[0] != 0); + assert(scales[1] != 0); + BoxSum(top_border, top_border_stride, width, sum_stride, temp_stride, sum3[0], + sum5[1], square_sum3[0], square_sum5[1]); + sum5[0] = sum5[1]; + square_sum5[0] = 
square_sum5[1]; + const uint16_t* const s = (height > 1) ? src + stride : bottom_border; + BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3, + square_sum5, sum_width, ma343, ma444[0], ma565[0], + b343, b444[0], b565[0]); + sum5[0] = sgr_buffer->sum5 + kSumOffset; + square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate4PointersBy2<uint16_t>(sum3); + Circulate4PointersBy2<uint32_t>(square_sum3); + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width, + scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width, + ma343, ma444, ma565, b343, b444, b565, dst); + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2<uint16_t>(ma343); + Circulate4PointersBy2<uint32_t>(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate4PointersBy2<uint16_t>(sum3); + Circulate4PointersBy2<uint32_t>(square_sum3); + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + if ((height & 1) == 0 || height > 1) { + const uint16_t* sr[2]; + if ((height & 1) == 0) { + sr[0] = bottom_border; + sr[1] = bottom_border + bottom_border_stride; + } else { + sr[0] = src + 2 * stride; + sr[1] = bottom_border; + } + BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5, + square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343, + b444, b565, dst); + } + if ((height & 1) != 0) { + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2<uint16_t>(sum3); + Circulate4PointersBy2<uint32_t>(square_sum3); + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + Circulate4PointersBy2<uint16_t>(ma343); + Circulate4PointersBy2<uint32_t>(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], 
b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width, + sum_width, scales, w0, w2, sum3, sum5, square_sum3, + square_sum5, ma343[0], ma444[0], ma565[0], b343[0], + b444[0], b565[0], dst); + } +} + +inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, + const uint16_t* src, const ptrdiff_t stride, + const uint16_t* const top_border, + const ptrdiff_t top_border_stride, + const uint16_t* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint16_t* dst) { + const auto temp_stride = Align<ptrdiff_t>(width, 32); + const auto sum_width = temp_stride + 8; + const auto sum_stride = temp_stride + 32; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12. + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + uint16_t *sum5[5], *ma565[2]; + uint32_t *square_sum5[5], *b565[2]; + sum5[0] = sgr_buffer->sum5 + kSumOffset; + square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scale != 0); + BoxSum<5>(top_border, top_border_stride, width, sum_stride, temp_stride, + sum5[1], square_sum5[1]); + sum5[0] = sum5[1]; + square_sum5[0] = square_sum5[1]; + const uint16_t* const s = (height > 1) ? 
src + stride : bottom_border; + BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width, + ma565[0], b565[0]); + sum5[0] = sgr_buffer->sum5 + kSumOffset; + square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5, + square_sum5, width, sum_width, scale, w0, ma565, b565, dst); + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + if ((height & 1) == 0 || height > 1) { + const uint16_t* sr[2]; + if ((height & 1) == 0) { + sr[0] = bottom_border; + sr[1] = bottom_border + bottom_border_stride; + } else { + sr[0] = src + 2 * stride; + sr[1] = bottom_border; + } + BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width, + sum_width, scale, w0, ma565, b565, dst); + } + if ((height & 1) != 0) { + src += 3; + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + } + BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width, + sum_width, scale, w0, sum5, square_sum5, ma565[0], + b565[0], dst); + } +} + +inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, + const uint16_t* src, const ptrdiff_t stride, + const uint16_t* const top_border, + const ptrdiff_t top_border_stride, + const uint16_t* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint16_t* dst) { + assert(restoration_info.sgr_proj_info.multiplier[0] == 0); + const auto temp_stride = Align<ptrdiff_t>(width, 32); + const auto sum_width = temp_stride + 8; + const 
auto sum_stride = temp_stride + 32; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12. + uint16_t *sum3[3], *ma343[3], *ma444[2]; + uint32_t *square_sum3[3], *b343[3], *b444[2]; + sum3[0] = sgr_buffer->sum3 + kSumOffset; + square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset; + ma343[0] = sgr_buffer->ma343; + b343[0] = sgr_buffer->b343; + for (int i = 1; i <= 2; ++i) { + sum3[i] = sum3[i - 1] + sum_stride; + square_sum3[i] = square_sum3[i - 1] + sum_stride; + ma343[i] = ma343[i - 1] + temp_stride; + b343[i] = b343[i - 1] + temp_stride; + } + ma444[0] = sgr_buffer->ma444; + ma444[1] = ma444[0] + temp_stride; + b444[0] = sgr_buffer->b444; + b444[1] = b444[0] + temp_stride; + assert(scale != 0); + BoxSum<3>(top_border, top_border_stride, width, sum_stride, temp_stride, + sum3[0], square_sum3[0]); + BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, + sum_width, ma343[0], nullptr, b343[0], + nullptr); + Circulate3PointersBy1<uint16_t>(sum3); + Circulate3PointersBy1<uint32_t>(square_sum3); + const uint16_t* s; + if (height > 1) { + s = src + stride; + } else { + s = bottom_border; + bottom_border += bottom_border_stride; + } + BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width, + ma343[1], ma444[0], b343[1], b444[0]); + + for (int y = height - 2; y > 0; --y) { + Circulate3PointersBy1<uint16_t>(sum3); + Circulate3PointersBy1<uint32_t>(square_sum3); + BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3, + square_sum3, ma343, ma444, b343, b444, dst); + src += stride; + dst += stride; + Circulate3PointersBy1<uint16_t>(ma343); + Circulate3PointersBy1<uint32_t>(b343); + std::swap(ma444[0], ma444[1]); + std::swap(b444[0], b444[1]); + } + + int y = std::min(height, 2); + src += 2; + do { + 
Circulate3PointersBy1<uint16_t>(sum3); + Circulate3PointersBy1<uint32_t>(square_sum3); + BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3, + square_sum3, ma343, ma444, b343, b444, dst); + src += stride; + dst += stride; + bottom_border += bottom_border_stride; + Circulate3PointersBy1<uint16_t>(ma343); + Circulate3PointersBy1<uint32_t>(b343); + std::swap(ma444[0], ma444[1]); + std::swap(b444[0], b444[1]); + } while (--y != 0); +} + +// If |width| is non-multiple of 32, up to 31 more pixels are written to |dest| +// in the end of each row. It is safe to overwrite the output as it will not be +// part of the visible frame. +void SelfGuidedFilter_AVX2( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { + const int index = restoration_info.sgr_proj_info.index; + const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 + const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0 + const auto* const src = static_cast<const uint16_t*>(source); + const auto* const top = static_cast<const uint16_t*>(top_border); + const auto* const bottom = static_cast<const uint16_t*>(bottom_border); + auto* const dst = static_cast<uint16_t*>(dest); + SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer; + if (radius_pass_1 == 0) { + // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the + // following assertion. 
+ assert(radius_pass_0 != 0); + BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, + width, height, sgr_buffer, dst); + } else if (radius_pass_0 == 0) { + BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2, + top_border_stride, bottom - 2, bottom_border_stride, + width, height, sgr_buffer, dst); + } else { + BoxFilterProcess(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, width, + height, sgr_buffer, dst); + } +} + void Init10bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); assert(dsp != nullptr); #if DSP_ENABLED_10BPP_AVX2(WienerFilter) dsp->loop_restorations[0] = WienerFilter_AVX2; #endif +#if DSP_ENABLED_10BPP_AVX2(SelfGuidedFilter) + dsp->loop_restorations[1] = SelfGuidedFilter_AVX2; +#endif } } // namespace @@ -581,7 +3146,7 @@ void LoopRestorationInit10bpp_AVX2() { Init10bpp(); } } // namespace dsp } // namespace libgav1 -#else // !(LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10) +#else // !(LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10) namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/loop_restoration_10bit_sse4.cc b/src/dsp/x86/loop_restoration_10bit_sse4.cc index 0598435..96380e3 100644 --- a/src/dsp/x86/loop_restoration_10bit_sse4.cc +++ b/src/dsp/x86/loop_restoration_10bit_sse4.cc @@ -428,13 +428,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer, } } -void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info, - const void* const source, const void* const top_border, - const void* const bottom_border, - const ptrdiff_t stride, const int width, - const int height, - RestorationBuffer* const restoration_buffer, - void* const dest) { +void WienerFilter_SSE4_1( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const 
bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { const int16_t* const number_leading_zero_coefficients = restoration_info.wiener_info.number_leading_zero_coefficients; const int number_rows_to_skip = std::max( @@ -458,39 +457,42 @@ void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info, const __m128i coefficients_horizontal = LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]); if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { - WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride, - wiener_stride, height_extra, coefficients_horizontal, - &wiener_buffer_horizontal); - WienerHorizontalTap7(src - 3, stride, wiener_stride, height, + WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3, + top_border_stride, wiener_stride, height_extra, coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra, + WienerHorizontalTap7(src - 3, stride, wiener_stride, height, coefficients_horizontal, &wiener_buffer_horizontal); - } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { - WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride, - wiener_stride, height_extra, coefficients_horizontal, + WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride, + height_extra, coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(src - 2, stride, wiener_stride, height, + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2, + top_border_stride, wiener_stride, height_extra, coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra, + WienerHorizontalTap5(src - 2, stride, wiener_stride, height, coefficients_horizontal, 
&wiener_buffer_horizontal); + WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride, + height_extra, coefficients_horizontal, + &wiener_buffer_horizontal); } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { // The maximum over-reads happen here. - WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride, - wiener_stride, height_extra, coefficients_horizontal, - &wiener_buffer_horizontal); - WienerHorizontalTap3(src - 1, stride, wiener_stride, height, + WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1, + top_border_stride, wiener_stride, height_extra, coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra, + WienerHorizontalTap3(src - 1, stride, wiener_stride, height, coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride, + height_extra, coefficients_horizontal, + &wiener_buffer_horizontal); } else { assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); - WienerHorizontalTap1(top + (2 - height_extra) * stride, stride, - wiener_stride, height_extra, + WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride, + top_border_stride, wiener_stride, height_extra, &wiener_buffer_horizontal); WienerHorizontalTap1(src, stride, wiener_stride, height, &wiener_buffer_horizontal); - WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra, - &wiener_buffer_horizontal); + WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride, + height_extra, &wiener_buffer_horizontal); } // vertical filtering. @@ -522,6 +524,1978 @@ void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info, } } +//------------------------------------------------------------------------------ +// SGR + +// SIMD overreads 8 - (width % 8) - 2 * padding pixels, where padding is 3 for +// Pass 1 and 2 for Pass 2. 
+constexpr int kOverreadInBytesPass1 = 4; +constexpr int kOverreadInBytesPass2 = 8; + +inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x, + __m128i dst[2]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); +} + +inline void LoadAligned16x2U16Msan(const uint16_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[2]) { + dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border)); + dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border)); +} + +inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x, + __m128i dst[3]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); + dst[2] = LoadAligned16(src[2] + x); +} + +inline void LoadAligned16x3U16Msan(const uint16_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[3]) { + dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border)); + dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border)); + dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border)); +} + +inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) { + dst[0] = LoadAligned16(src + 0); + dst[1] = LoadAligned16(src + 4); +} + +inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x, + const ptrdiff_t border, __m128i dst[2]) { + dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border)); + dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border)); +} + +inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x, + __m128i dst[2][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); +} + +inline void LoadAligned32x2U32Msan(const uint32_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[2][2]) { + LoadAligned32U32Msan(src[0], x, border, dst[0]); + LoadAligned32U32Msan(src[1], x, border, dst[1]); +} 
+ +inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x, + __m128i dst[3][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); + LoadAligned32U32(src[2] + x, dst[2]); +} + +inline void LoadAligned32x3U32Msan(const uint32_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[3][2]) { + LoadAligned32U32Msan(src[0], x, border, dst[0]); + LoadAligned32U32Msan(src[1], x, border, dst[1]); + LoadAligned32U32Msan(src[2], x, border, dst[2]); +} + +inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) { + StoreAligned16(dst + 0, src[0]); + StoreAligned16(dst + 8, src[1]); +} + +inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) { + StoreAligned16(dst + 0, src[0]); + StoreAligned16(dst + 4, src[1]); +} + +inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) { + StoreAligned32U32(dst + 0, src + 0); + StoreAligned32U32(dst + 8, src + 2); +} + +// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following +// functions. Some compilers may generate super inefficient code and the whole +// decoder could be 15% slower. 
+ +inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128()); + return _mm_add_epi16(s0, s1); +} + +inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128()); + return _mm_add_epi16(s0, s1); +} + +inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) { + const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128()); + return _mm_add_epi16(src0, s1); +} + +inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) { + const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128()); + return _mm_add_epi16(src0, s1); +} + +inline __m128i VmullNLo8(const __m128i src0, const int src1) { + const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128()); + return _mm_madd_epi16(s0, _mm_set1_epi32(src1)); +} + +inline __m128i VmullNHi8(const __m128i src0, const int src1) { + const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128()); + return _mm_madd_epi16(s0, _mm_set1_epi32(src1)); +} + +inline __m128i VmullLo16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128()); + return _mm_madd_epi16(s0, s1); +} + +inline __m128i VmullHi16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128()); + return _mm_madd_epi16(s0, s1); +} + +inline __m128i VrshrU16(const __m128i src0, const int src1) { + const __m128i sum = _mm_add_epi16(src0, _mm_set1_epi16(1 << (src1 - 1))); + return _mm_srli_epi16(sum, src1); +} + +inline __m128i VrshrS32(const __m128i src0, const int src1) { + const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 
1))); + return _mm_srai_epi32(sum, src1); +} + +inline __m128i VrshrU32(const __m128i src0, const int src1) { + const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1))); + return _mm_srli_epi32(sum, src1); +} + +inline void Square(const __m128i src, __m128i dst[2]) { + const __m128i s0 = _mm_unpacklo_epi16(src, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi16(src, _mm_setzero_si128()); + dst[0] = _mm_madd_epi16(s0, s0); + dst[1] = _mm_madd_epi16(s1, s1); +} + +template <int offset> +inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) { + dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0); + dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1); + dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2); +} + +inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) { + dst[0] = src[0]; + dst[1] = _mm_alignr_epi8(src[1], src[0], 2); + dst[2] = _mm_alignr_epi8(src[1], src[0], 4); +} + +inline void Prepare3_32(const __m128i src[2], __m128i dst[3]) { + dst[0] = src[0]; + dst[1] = _mm_alignr_epi8(src[1], src[0], 4); + dst[2] = _mm_alignr_epi8(src[1], src[0], 8); +} + +inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) { + Prepare3_16(src, dst); + dst[3] = _mm_alignr_epi8(src[1], src[0], 6); + dst[4] = _mm_alignr_epi8(src[1], src[0], 8); +} + +inline void Prepare5_32(const __m128i src[2], __m128i dst[5]) { + Prepare3_32(src, dst); + dst[3] = _mm_alignr_epi8(src[1], src[0], 12); + dst[4] = src[1]; +} + +inline __m128i Sum3_16(const __m128i src0, const __m128i src1, + const __m128i src2) { + const __m128i sum = _mm_add_epi16(src0, src1); + return _mm_add_epi16(sum, src2); +} + +inline __m128i Sum3_16(const __m128i src[3]) { + return Sum3_16(src[0], src[1], src[2]); +} + +inline __m128i Sum3_32(const __m128i src0, const __m128i src1, + const __m128i src2) { + const __m128i sum = _mm_add_epi32(src0, src1); + return _mm_add_epi32(sum, src2); +} + +inline __m128i Sum3_32(const __m128i src[3]) { + return Sum3_32(src[0], src[1], 
src[2]); +} + +inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) { + dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]); + dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]); +} + +inline __m128i Sum3WLo16(const __m128i src[3]) { + const __m128i sum = VaddlLo8(src[0], src[1]); + return VaddwLo8(sum, src[2]); +} + +inline __m128i Sum3WHi16(const __m128i src[3]) { + const __m128i sum = VaddlHi8(src[0], src[1]); + return VaddwHi8(sum, src[2]); +} + +inline __m128i Sum5_16(const __m128i src[5]) { + const __m128i sum01 = _mm_add_epi16(src[0], src[1]); + const __m128i sum23 = _mm_add_epi16(src[2], src[3]); + const __m128i sum = _mm_add_epi16(sum01, sum23); + return _mm_add_epi16(sum, src[4]); +} + +inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1, + const __m128i* const src2, const __m128i* const src3, + const __m128i* const src4) { + const __m128i sum01 = _mm_add_epi32(*src0, *src1); + const __m128i sum23 = _mm_add_epi32(*src2, *src3); + const __m128i sum = _mm_add_epi32(sum01, sum23); + return _mm_add_epi32(sum, *src4); +} + +inline __m128i Sum5_32(const __m128i src[5]) { + return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]); +} + +inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) { + dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]); + dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]); +} + +inline __m128i Sum3Horizontal16(const __m128i src[2]) { + __m128i s[3]; + Prepare3_16(src, s); + return Sum3_16(s); +} + +inline void Sum3Horizontal32(const __m128i src[3], __m128i dst[2]) { + __m128i s[3]; + Prepare3_32(src + 0, s); + dst[0] = Sum3_32(s); + Prepare3_32(src + 1, s); + dst[1] = Sum3_32(s); +} + +inline __m128i Sum5Horizontal16(const __m128i src[2]) { + __m128i s[5]; + Prepare5_16(src, s); + return Sum5_16(s); +} + +inline void Sum5Horizontal32(const __m128i src[3], __m128i dst[2]) { + __m128i s[5]; + Prepare5_32(src + 0, s); + dst[0] = Sum5_32(s); + 
Prepare5_32(src + 1, s); + dst[1] = Sum5_32(s); +} + +void SumHorizontal16(const __m128i src[2], __m128i* const row3, + __m128i* const row5) { + __m128i s[5]; + Prepare5_16(src, s); + const __m128i sum04 = _mm_add_epi16(s[0], s[4]); + *row3 = Sum3_16(s + 1); + *row5 = _mm_add_epi16(sum04, *row3); +} + +inline void SumHorizontal16(const __m128i src[3], __m128i* const row3_0, + __m128i* const row3_1, __m128i* const row5_0, + __m128i* const row5_1) { + SumHorizontal16(src + 0, row3_0, row5_0); + SumHorizontal16(src + 1, row3_1, row5_1); +} + +void SumHorizontal32(const __m128i src[5], __m128i* const row_sq3, + __m128i* const row_sq5) { + const __m128i sum04 = _mm_add_epi32(src[0], src[4]); + *row_sq3 = Sum3_32(src + 1); + *row_sq5 = _mm_add_epi32(sum04, *row_sq3); +} + +inline void SumHorizontal32(const __m128i src[3], __m128i* const row_sq3_0, + __m128i* const row_sq3_1, __m128i* const row_sq5_0, + __m128i* const row_sq5_1) { + __m128i s[5]; + Prepare5_32(src + 0, s); + SumHorizontal32(s, row_sq3_0, row_sq5_0); + Prepare5_32(src + 1, s); + SumHorizontal32(s, row_sq3_1, row_sq5_1); +} + +inline __m128i Sum343Lo(const __m128i ma3[3]) { + const __m128i sum = Sum3WLo16(ma3); + const __m128i sum3 = Sum3_16(sum, sum, sum); + return VaddwLo8(sum3, ma3[1]); +} + +inline __m128i Sum343Hi(const __m128i ma3[3]) { + const __m128i sum = Sum3WHi16(ma3); + const __m128i sum3 = Sum3_16(sum, sum, sum); + return VaddwHi8(sum3, ma3[1]); +} + +inline __m128i Sum343(const __m128i src[3]) { + const __m128i sum = Sum3_32(src); + const __m128i sum3 = Sum3_32(sum, sum, sum); + return _mm_add_epi32(sum3, src[1]); +} + +inline void Sum343(const __m128i src[3], __m128i dst[2]) { + __m128i s[3]; + Prepare3_32(src + 0, s); + dst[0] = Sum343(s); + Prepare3_32(src + 1, s); + dst[1] = Sum343(s); +} + +inline __m128i Sum565Lo(const __m128i src[3]) { + const __m128i sum = Sum3WLo16(src); + const __m128i sum4 = _mm_slli_epi16(sum, 2); + const __m128i sum5 = _mm_add_epi16(sum4, sum); + return 
VaddwLo8(sum5, src[1]); +} + +inline __m128i Sum565Hi(const __m128i src[3]) { + const __m128i sum = Sum3WHi16(src); + const __m128i sum4 = _mm_slli_epi16(sum, 2); + const __m128i sum5 = _mm_add_epi16(sum4, sum); + return VaddwHi8(sum5, src[1]); +} + +inline __m128i Sum565(const __m128i src[3]) { + const __m128i sum = Sum3_32(src); + const __m128i sum4 = _mm_slli_epi32(sum, 2); + const __m128i sum5 = _mm_add_epi32(sum4, sum); + return _mm_add_epi32(sum5, src[1]); +} + +inline void Sum565(const __m128i src[3], __m128i dst[2]) { + __m128i s[3]; + Prepare3_32(src + 0, s); + dst[0] = Sum565(s); + Prepare3_32(src + 1, s); + dst[1] = Sum565(s); +} + +inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const ptrdiff_t sum_stride, + const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5, + uint32_t* square_sum3, uint32_t* square_sum5) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src) * width; + int y = 2; + do { + __m128i s[3], sq[6]; + s[0] = LoadUnaligned16Msan(src, overread_in_bytes); + Square(s[0], sq); + ptrdiff_t x = sum_width; + do { + __m128i row3[2], row5[2], row_sq3[2], row_sq5[2]; + s[1] = LoadUnaligned16Msan( + src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8)); + x -= 16; + src += 16; + s[2] = LoadUnaligned16Msan( + src, overread_in_bytes + sizeof(*src) * (sum_width - x)); + Square(s[1], sq + 2); + Square(s[2], sq + 4); + SumHorizontal16(s, &row3[0], &row3[1], &row5[0], &row5[1]); + StoreAligned32U16(sum3, row3); + StoreAligned32U16(sum5, row5); + SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], + &row_sq5[1]); + StoreAligned32U32(square_sum3 + 0, row_sq3); + StoreAligned32U32(square_sum5 + 0, row_sq5); + SumHorizontal32(sq + 2, &row_sq3[0], &row_sq3[1], &row_sq5[0], + &row_sq5[1]); + StoreAligned32U32(square_sum3 + 8, row_sq3); + StoreAligned32U32(square_sum5 + 8, row_sq5); + s[0] = s[2]; + sq[0] = sq[4]; + sq[1] = sq[5]; + sum3 += 16; + sum5 += 16; + 
square_sum3 += 16; + square_sum5 += 16; + } while (x != 0); + src += src_stride - sum_width; + sum3 += sum_stride - sum_width; + sum5 += sum_stride - sum_width; + square_sum3 += sum_stride - sum_width; + square_sum5 += sum_stride - sum_width; + } while (--y != 0); +} + +template <int size> +inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const ptrdiff_t sum_stride, + const ptrdiff_t sum_width, uint16_t* sums, + uint32_t* square_sums) { + static_assert(size == 3 || size == 5, ""); + const ptrdiff_t overread_in_bytes = + ((size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2) - + sizeof(*src) * width; + int y = 2; + do { + __m128i s[3], sq[6]; + s[0] = LoadUnaligned16Msan(src, overread_in_bytes); + Square(s[0], sq); + ptrdiff_t x = sum_width; + do { + __m128i row[2], row_sq[4]; + s[1] = LoadUnaligned16Msan( + src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8)); + x -= 16; + src += 16; + s[2] = LoadUnaligned16Msan( + src, overread_in_bytes + sizeof(*src) * (sum_width - x)); + Square(s[1], sq + 2); + Square(s[2], sq + 4); + if (size == 3) { + row[0] = Sum3Horizontal16(s + 0); + row[1] = Sum3Horizontal16(s + 1); + Sum3Horizontal32(sq + 0, row_sq + 0); + Sum3Horizontal32(sq + 2, row_sq + 2); + } else { + row[0] = Sum5Horizontal16(s + 0); + row[1] = Sum5Horizontal16(s + 1); + Sum5Horizontal32(sq + 0, row_sq + 0); + Sum5Horizontal32(sq + 2, row_sq + 2); + } + StoreAligned32U16(sums, row); + StoreAligned64U32(square_sums, row_sq); + s[0] = s[2]; + sq[0] = sq[4]; + sq[1] = sq[5]; + sums += 16; + square_sums += 16; + } while (x != 0); + src += src_stride - sum_width; + sums += sum_stride - sum_width; + square_sums += sum_stride - sum_width; + } while (--y != 0); +} + +template <int n> +inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq, + const uint32_t scale) { + static_assert(n == 9 || n == 25, ""); + // a = |sum_sq| + // d = |sum| + // p = (a * n < d * d) ? 
0 : a * n - d * d; + const __m128i dxd = _mm_madd_epi16(sum, sum); + // _mm_mullo_epi32() has high latency. Using shifts and additions instead. + // Some compilers could do this for us but we make this explicit. + // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n)); + __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3)); + if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4)); + const __m128i sub = _mm_sub_epi32(axn, dxd); + const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128()); + const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale)); + return VrshrU32(pxs, kSgrProjScaleBits); +} + +template <int n> +inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2], + const uint32_t scale) { + static_assert(n == 9 || n == 25, ""); + const __m128i b = VrshrU16(sum, 2); + const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128()); + const __m128i sum_hi = _mm_unpackhi_epi16(b, _mm_setzero_si128()); + const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale); + const __m128i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale); + return _mm_packus_epi32(z0, z1); +} + +inline void CalculateB5(const __m128i sum, const __m128i ma, __m128i b[2]) { + // one_over_n == 164. + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25; + // one_over_n_quarter == 41. + constexpr uint32_t one_over_n_quarter = one_over_n >> 2; + static_assert(one_over_n == one_over_n_quarter << 2, ""); + // |ma| is in range [0, 255]. + const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter)); + const __m128i m0 = VmullLo16(m, sum); + const __m128i m1 = VmullHi16(m, sum); + b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2); + b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2); +} + +inline void CalculateB3(const __m128i sum, const __m128i ma, __m128i b[2]) { + // one_over_n == 455. 
+ constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9; + const __m128i m0 = VmullLo16(ma, sum); + const __m128i m1 = VmullHi16(ma, sum); + const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n)); + const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n)); + b[0] = VrshrU32(m2, kSgrProjReciprocalBits); + b[1] = VrshrU32(m3, kSgrProjReciprocalBits); +} + +inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2], + const uint32_t scale, __m128i* const sum, + __m128i* const index) { + __m128i sum_sq[2]; + *sum = Sum5_16(s5); + Sum5_32(sq5, sum_sq); + *index = CalculateMa<25>(*sum, sum_sq, scale); +} + +inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2], + const uint32_t scale, __m128i* const sum, + __m128i* const index) { + __m128i sum_sq[2]; + *sum = Sum3_16(s3); + Sum3_32(sq3, sum_sq); + *index = CalculateMa<9>(*sum, sum_sq, scale); +} + +template <int n, int offset> +inline void LookupIntermediate(const __m128i sum, const __m128i index, + __m128i* const ma, __m128i b[2]) { + static_assert(n == 9 || n == 25, ""); + static_assert(offset == 0 || offset == 8, ""); + const __m128i idx = _mm_packus_epi16(index, index); + // Actually it's not stored and loaded. The compiler will use a 64-bit + // general-purpose register to process. Faster than using _mm_extract_epi8(). 
+ uint8_t temp[8]; + StoreLo8(temp, idx); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7); + // b = ma * b * one_over_n + // |ma| = [0, 255] + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. + // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). + __m128i maq; + if (offset == 0) { + maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128()); + } else { + maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128()); + } + if (n == 9) { + CalculateB3(sum, maq, b); + } else { + CalculateB5(sum, maq, b); + } +} + +// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b +// to get value 0 as the shuffle result. The most significiant bit 1 comes +// either from the comparison instruction, or from the sign bit of the index. 
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) { + __m128i mask; + mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15)); + mask = _mm_or_si128(mask, index); + return _mm_shuffle_epi8(table, mask); +} + +inline __m128i AdjustValue(const __m128i value, const __m128i index, + const int threshold) { + const __m128i thresholds = _mm_set1_epi8(threshold - 128); + const __m128i offset = _mm_cmpgt_epi8(index, thresholds); + return _mm_add_epi8(value, offset); +} + +inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], + __m128i* const ma, __m128i b0[2], + __m128i b1[2]) { + // Use table lookup to read elements whose indices are less than 48. + const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16); + const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16); + const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16); + const __m128i indices = _mm_packus_epi16(index[0], index[1]); + __m128i idx; + // Clip idx to 127 to apply signed comparison instructions. + idx = _mm_min_epu8(indices, _mm_set1_epi8(127)); + // All elements whose indices are less than 48 are set to 0. + // Get shuffle results for indices in range [0, 15]. + *ma = ShuffleIndex(c0, idx); + // Get shuffle results for indices in range [16, 31]. + // Subtract 16 to utilize the sign bit of the index. + idx = _mm_sub_epi8(idx, _mm_set1_epi8(16)); + const __m128i res1 = ShuffleIndex(c1, idx); + // Use OR instruction to combine shuffle results together. + *ma = _mm_or_si128(*ma, res1); + // Get shuffle results for indices in range [32, 47]. + // Subtract 16 to utilize the sign bit of the index. + idx = _mm_sub_epi8(idx, _mm_set1_epi8(16)); + const __m128i res2 = ShuffleIndex(c2, idx); + *ma = _mm_or_si128(*ma, res2); + + // For elements whose indices are larger than 47, since they seldom change + // values with the increase of the index, we use comparison and arithmetic + // operations to calculate their values. + // Add -128 to apply signed comparison instructions. 
+ idx = _mm_add_epi8(indices, _mm_set1_epi8(-128)); + // Elements whose indices are larger than 47 (with value 0) are set to 5. + *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5)); + *ma = AdjustValue(*ma, idx, 55); // 55 is the last index which value is 5. + *ma = AdjustValue(*ma, idx, 72); // 72 is the last index which value is 4. + *ma = AdjustValue(*ma, idx, 101); // 101 is the last index which value is 3. + *ma = AdjustValue(*ma, idx, 169); // 169 is the last index which value is 2. + *ma = AdjustValue(*ma, idx, 254); // 254 is the last index which value is 1. + + // b = ma * b * one_over_n + // |ma| = [0, 255] + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. + // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). + const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128()); + CalculateB3(sum[0], maq0, b0); + const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128()); + CalculateB3(sum[1], maq1, b1); +} + +inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], + __m128i ma[2], __m128i b[4]) { + __m128i mas; + CalculateIntermediate(sum, index, &mas, b + 0, b + 2); + ma[0] = _mm_unpacklo_epi64(ma[0], mas); + ma[1] = _mm_srli_si128(mas, 8); +} + +// Note: It has been tried to call CalculateIntermediate() to replace the slow +// LookupIntermediate() when calculating 16 intermediate data points. However, +// the compiler generates even slower code. 
+template <int offset> +inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2], + const uint32_t scale, __m128i* const ma, + __m128i b[2]) { + static_assert(offset == 0 || offset == 8, ""); + __m128i sum, index; + CalculateSumAndIndex5(s5, sq5, scale, &sum, &index); + LookupIntermediate<25, offset>(sum, index, ma, b); +} + +inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2], + const uint32_t scale, __m128i* const ma, + __m128i b[2]) { + __m128i sum, index; + CalculateSumAndIndex3(s3, sq3, scale, &sum, &index); + LookupIntermediate<9, 0>(sum, index, ma, b); +} + +inline void Store343_444(const __m128i b3[3], const ptrdiff_t x, + __m128i sum_b343[2], __m128i sum_b444[2], + uint32_t* const b343, uint32_t* const b444) { + __m128i b[3], sum_b111[2]; + Prepare3_32(b3 + 0, b); + sum_b111[0] = Sum3_32(b); + sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2); + sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]); + sum_b343[0] = _mm_add_epi32(sum_b343[0], b[1]); + Prepare3_32(b3 + 1, b); + sum_b111[1] = Sum3_32(b); + sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2); + sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]); + sum_b343[1] = _mm_add_epi32(sum_b343[1], b[1]); + StoreAligned32U32(b444 + x, sum_b444); + StoreAligned32U32(b343 + x, sum_b343); +} + +inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[3], + const ptrdiff_t x, __m128i* const sum_ma343, + __m128i* const sum_ma444, __m128i sum_b343[2], + __m128i sum_b444[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + const __m128i sum_ma111 = Sum3WLo16(ma3); + *sum_ma444 = _mm_slli_epi16(sum_ma111, 2); + StoreAligned16(ma444 + x, *sum_ma444); + const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111); + *sum_ma343 = VaddwLo8(sum333, ma3[1]); + StoreAligned16(ma343 + x, *sum_ma343); + Store343_444(b3, x, sum_b343, sum_b444, b343, b444); +} + +inline void Store343_444Hi(const __m128i ma3[3], const __m128i 
b3[3], + const ptrdiff_t x, __m128i* const sum_ma343, + __m128i* const sum_ma444, __m128i sum_b343[2], + __m128i sum_b444[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + const __m128i sum_ma111 = Sum3WHi16(ma3); + *sum_ma444 = _mm_slli_epi16(sum_ma111, 2); + StoreAligned16(ma444 + x, *sum_ma444); + const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111); + *sum_ma343 = VaddwHi8(sum333, ma3[1]); + StoreAligned16(ma343 + x, *sum_ma343); + Store343_444(b3, x, sum_b343, sum_b444, b343, b444); +} + +inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, __m128i* const sum_ma343, + __m128i sum_b343[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m128i sum_ma444, sum_b444[2]; + Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343, + ma444, b343, b444); +} + +inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, __m128i* const sum_ma343, + __m128i sum_b343[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m128i sum_ma444, sum_b444[2]; + Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343, + ma444, b343, b444); +} + +inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m128i sum_ma343, sum_b343[2]; + Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444); +} + +inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m128i sum_ma343, sum_b343[2]; + Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo( + const __m128i s[2][4], 
const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i* const ma, + __m128i b[2]) { + __m128i s5[2][5], sq5[5][2]; + Square(s[0][1], sq[0] + 2); + Square(s[1][1], sq[1] + 2); + s5[0][3] = Sum5Horizontal16(s[0]); + StoreAligned16(sum5[3], s5[0][3]); + s5[0][4] = Sum5Horizontal16(s[1]); + StoreAligned16(sum5[4], s5[0][4]); + Sum5Horizontal32(sq[0], sq5[3]); + StoreAligned32U32(square_sum5[3], sq5[3]); + Sum5Horizontal32(sq[1], sq5[4]); + StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x3U16(sum5, 0, s5[0]); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5( + const __m128i s[2][4], const ptrdiff_t sum_width, const ptrdiff_t x, + const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma[2], + __m128i b[6]) { + __m128i s5[2][5], sq5[5][2]; + Square(s[0][2], sq[0] + 4); + Square(s[1][2], sq[1] + 4); + s5[0][3] = Sum5Horizontal16(s[0] + 1); + s5[1][3] = Sum5Horizontal16(s[0] + 2); + StoreAligned16(sum5[3] + x + 0, s5[0][3]); + StoreAligned16(sum5[3] + x + 8, s5[1][3]); + s5[0][4] = Sum5Horizontal16(s[1] + 1); + s5[1][4] = Sum5Horizontal16(s[1] + 2); + StoreAligned16(sum5[4] + x + 0, s5[0][4]); + StoreAligned16(sum5[4] + x + 8, s5[1][4]); + Sum5Horizontal32(sq[0] + 2, sq5[3]); + StoreAligned32U32(square_sum5[3] + x, sq5[3]); + Sum5Horizontal32(sq[1] + 2, sq5[4]); + StoreAligned32U32(square_sum5[4] + x, sq5[4]); + LoadAligned16x3U16(sum5, x, s5[0]); + LoadAligned32x3U32(square_sum5, x, sq5); + CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2); + + Square(s[0][3], sq[0] + 6); + Square(s[1][3], sq[1] + 6); + Sum5Horizontal32(sq[0] + 4, sq5[3]); + StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]); + Sum5Horizontal32(sq[1] + 4, sq5[4]); + StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]); + LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]); + 
LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5); + CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo( + const __m128i s[2], const uint32_t scale, const uint16_t* const sum5[5], + const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma, + __m128i b[2]) { + __m128i s5[5], sq5[5][2]; + Square(s[1], sq + 2); + s5[3] = s5[4] = Sum5Horizontal16(s); + Sum5Horizontal32(sq, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5<0>(s5, sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow( + const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x, + const uint32_t scale, const uint16_t* const sum5[5], + const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma[2], + __m128i b[6]) { + __m128i s5[2][5], sq5[5][2]; + Square(s[2], sq + 4); + s5[0][3] = Sum5Horizontal16(s + 1); + s5[1][3] = Sum5Horizontal16(s + 2); + s5[0][4] = s5[0][3]; + s5[1][4] = s5[1][3]; + Sum5Horizontal32(sq + 2, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned16x3U16(sum5, x, s5[0]); + LoadAligned32x3U32(square_sum5, x, sq5); + CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2); + + Square(s[3], sq + 6); + Sum5Horizontal32(sq + 4, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]); + LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5); + CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo( + const __m128i s[2], const uint32_t scale, uint16_t* const sum3[3], + uint32_t* const square_sum3[3], __m128i sq[4], __m128i* const ma, + __m128i b[2]) { + __m128i s3[3], sq3[3][2]; + Square(s[1], sq + 2); + s3[2] = Sum3Horizontal16(s); + StoreAligned16(sum3[2], s3[2]); + Sum3Horizontal32(sq, sq3[2]); + 
StoreAligned32U32(square_sum3[2], sq3[2]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3(s3, sq3, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3( + const __m128i s[4], const ptrdiff_t x, const ptrdiff_t sum_width, + const uint32_t scale, uint16_t* const sum3[3], + uint32_t* const square_sum3[3], __m128i sq[8], __m128i ma[2], + __m128i b[6]) { + __m128i s3[4], sq3[3][2], sum[2], index[2]; + Square(s[2], sq + 4); + s3[2] = Sum3Horizontal16(s + 1); + s3[3] = Sum3Horizontal16(s + 2); + StoreAligned32U16(sum3[2] + x, s3 + 2); + Sum3Horizontal32(sq + 2, sq3[2]); + StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]); + LoadAligned16x2U16(sum3, x, s3); + LoadAligned32x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]); + + Square(s[3], sq + 6); + Sum3Horizontal32(sq + 4, sq3[2]); + StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]); + LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1); + LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3); + CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]); + CalculateIntermediate(sum, index, ma, b + 2); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo( + const __m128i s[2][4], const uint16_t scales[2], uint16_t* const sum3[4], + uint16_t* const sum5[5], uint32_t* const square_sum3[4], + uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma3[2][2], + __m128i b3[2][6], __m128i* const ma5, __m128i b5[2]) { + __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2]; + Square(s[0][1], sq[0] + 2); + Square(s[1][1], sq[1] + 2); + SumHorizontal16(s[0], &s3[2], &s5[3]); + SumHorizontal16(s[1], &s3[3], &s5[4]); + StoreAligned16(sum3[2], s3[2]); + StoreAligned16(sum3[3], s3[3]); + StoreAligned16(sum5[3], s5[3]); + StoreAligned16(sum5[4], s5[4]); + SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2], sq3[2]); + StoreAligned32U32(square_sum5[3], 
sq5[3]); + SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3], sq3[3]); + StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]); + CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]); + CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]); + ma3[1][0] = _mm_srli_si128(ma3[0][0], 8); + CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( + const __m128i s[2][4], const ptrdiff_t x, const uint16_t scales[2], + uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, __m128i sq[2][8], __m128i ma3[2][2], + __m128i b3[2][6], __m128i ma5[2], __m128i b5[6]) { + __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2]; + SumHorizontal16(s[0] + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]); + StoreAligned16(sum3[2] + x + 0, s3[0][2]); + StoreAligned16(sum3[2] + x + 8, s3[1][2]); + StoreAligned16(sum5[3] + x + 0, s5[0][3]); + StoreAligned16(sum5[3] + x + 8, s5[1][3]); + SumHorizontal16(s[1] + 1, &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]); + StoreAligned16(sum3[3] + x + 0, s3[0][3]); + StoreAligned16(sum3[3] + x + 8, s3[1][3]); + StoreAligned16(sum5[4] + x + 0, s5[0][4]); + StoreAligned16(sum5[4] + x + 8, s5[1][4]); + Square(s[0][2], sq[0] + 4); + Square(s[1][2], sq[1] + 4); + SumHorizontal32(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2] + x, sq3[2]); + StoreAligned32U32(square_sum5[3] + x, sq5[3]); + SumHorizontal32(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3] + x, sq3[3]); + StoreAligned32U32(square_sum5[4] + x, sq5[4]); + 
LoadAligned16x2U16(sum3, x, s3[0]); + LoadAligned32x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]); + CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0], + &index[1][0]); + LoadAligned16x3U16(sum5, x, s5[0]); + LoadAligned32x3U32(square_sum5, x, sq5); + CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], b5 + 2); + + Square(s[0][3], sq[0] + 6); + Square(s[1][3], sq[1] + 6); + SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]); + StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]); + SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]); + StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]); + LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]); + LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]); + CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1], + &index[1][1]); + CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 2); + CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 2); + LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]); + LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5); + CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], b5 + 4); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo( + const __m128i s[2], const uint16_t scales[2], const uint16_t* const sum3[4], + const uint16_t* const sum5[5], const uint32_t* const square_sum3[4], + const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma3, + __m128i* const ma5, __m128i b3[2], __m128i b5[2]) { + __m128i s3[3], s5[5], sq3[3][2], sq5[5][2]; + Square(s[1], sq + 2); + SumHorizontal16(s, &s3[2], &s5[3]); + SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned16x3U16(sum5, 0, s5); + s5[4] = s5[3]; + 
LoadAligned32x3U32(square_sum5, 0, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3(s3, sq3, scales[1], ma3, b3); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( + const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x, + const uint16_t scales[2], const uint16_t* const sum3[4], + const uint16_t* const sum5[5], const uint32_t* const square_sum3[4], + const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma3[2], + __m128i ma5[2], __m128i b3[6], __m128i b5[6]) { + __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2]; + Square(s[2], sq + 4); + SumHorizontal16(s + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]); + SumHorizontal32(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned16x3U16(sum5, x, s5[0]); + s5[0][4] = s5[0][3]; + LoadAligned32x3U32(square_sum5, x, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 2); + LoadAligned16x2U16(sum3, x, s3[0]); + LoadAligned32x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]); + + Square(s[3], sq + 6); + SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]); + s5[1][4] = s5[1][3]; + LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 4); + LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]); + LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]); + CalculateIntermediate(sum, index, ma3, b3 + 2); +} + +inline void BoxSumFilterPreProcess5(const uint16_t* const src0, + const uint16_t* const src1, const int width, + const 
uint32_t scale, + uint16_t* const sum5[5], + uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* ma565, + uint32_t* b565) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + __m128i s[2][4], mas[2], sq[2][8], bs[6]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq[0]); + Square(s[1][0], sq[1]); + BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs); + + int x = 0; + do { + __m128i ma5[3], ma[2], b[4]; + s[0][2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[0][3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + s[1][2] = LoadUnaligned16Msan(src1 + x + 16, + overread_in_bytes + sizeof(*src1) * (x + 16)); + s[1][3] = LoadUnaligned16Msan(src1 + x + 24, + overread_in_bytes + sizeof(*src1) * (x + 24)); + BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas, + bs); + Prepare3_8<0>(mas, ma5); + ma[0] = Sum565Lo(ma5); + ma[1] = Sum565Hi(ma5); + StoreAligned32U16(ma565, ma); + Sum565(bs + 0, b + 0); + Sum565(bs + 2, b + 2); + StoreAligned64U32(b565, b); + s[0][0] = s[0][2]; + s[0][1] = s[0][3]; + s[1][0] = s[1][2]; + s[1][1] = s[1][3]; + sq[0][2] = sq[0][6]; + sq[0][3] = sq[0][7]; + sq[1][2] = sq[1][6]; + sq[1][3] = sq[1][7]; + mas[0] = mas[1]; + bs[0] = bs[4]; + bs[1] = bs[5]; + ma565 += 16; + b565 += 16; + x += 16; + } while (x < width); +} + +template <bool calculate444> +LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3( + const uint16_t* const src, const int width, const uint32_t scale, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], + const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343, + uint32_t* b444) { + const ptrdiff_t 
overread_in_bytes = + kOverreadInBytesPass2 - sizeof(*src) * width; + __m128i s[4], mas[2], sq[8], bs[6]; + s[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes + 0); + s[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes + 16); + Square(s[0], sq); + BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs); + + int x = 0; + do { + s[2] = LoadUnaligned16Msan(src + x + 16, + overread_in_bytes + sizeof(*src) * (x + 16)); + s[3] = LoadUnaligned16Msan(src + x + 24, + overread_in_bytes + sizeof(*src) * (x + 24)); + BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas, + bs); + __m128i ma3[3]; + Prepare3_8<0>(mas, ma3); + if (calculate444) { // NOLINT(readability-simplify-boolean-expr) + Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444); + Store343_444Hi(ma3, bs + 2, 8, ma343, ma444, b343, b444); + ma444 += 16; + b444 += 16; + } else { + __m128i ma[2], b[4]; + ma[0] = Sum343Lo(ma3); + ma[1] = Sum343Hi(ma3); + StoreAligned32U16(ma343, ma); + Sum343(bs + 0, b + 0); + Sum343(bs + 2, b + 2); + StoreAligned64U32(b343, b); + } + s[1] = s[3]; + sq[2] = sq[6]; + sq[3] = sq[7]; + mas[0] = mas[1]; + bs[0] = bs[4]; + bs[1] = bs[5]; + ma343 += 16; + b343 += 16; + x += 16; + } while (x < width); +} + +inline void BoxSumFilterPreProcess( + const uint16_t* const src0, const uint16_t* const src1, const int width, + const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444, + uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444, + uint32_t* b565) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, 
overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq[0]); + Square(s[1][0], sq[1]); + BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq, + ma3, b3, &ma5[0], b5); + + int x = 0; + do { + __m128i ma[2], b[4], ma3x[3], ma5x[3]; + s[0][2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[0][3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + s[1][2] = LoadUnaligned16Msan(src1 + x + 16, + overread_in_bytes + sizeof(*src1) * (x + 16)); + s[1][3] = LoadUnaligned16Msan(src1 + x + 24, + overread_in_bytes + sizeof(*src1) * (x + 24)); + BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5, + sum_width, sq, ma3, b3, ma5, b5); + + Prepare3_8<0>(ma3[0], ma3x); + ma[0] = Sum343Lo(ma3x); + ma[1] = Sum343Hi(ma3x); + StoreAligned32U16(ma343[0] + x, ma); + Sum343(b3[0] + 0, b + 0); + Sum343(b3[0] + 2, b + 2); + StoreAligned64U32(b343[0] + x, b); + Sum565(b5 + 0, b + 0); + Sum565(b5 + 2, b + 2); + StoreAligned64U32(b565, b); + Prepare3_8<0>(ma3[1], ma3x); + Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444); + Store343_444Hi(ma3x, b3[1] + 2, x + 8, ma343[1], ma444, b343[1], b444); + Prepare3_8<0>(ma5, ma5x); + ma[0] = Sum565Lo(ma5x); + ma[1] = Sum565Hi(ma5x); + StoreAligned32U16(ma565, ma); + s[0][0] = s[0][2]; + s[0][1] = s[0][3]; + s[1][0] = s[1][2]; + s[1][1] = s[1][3]; + sq[0][2] = sq[0][6]; + sq[0][3] = sq[0][7]; + sq[1][2] = sq[1][6]; + sq[1][3] = sq[1][7]; + ma3[0][0] = ma3[0][1]; + ma3[1][0] = ma3[1][1]; + ma5[0] = ma5[1]; + b3[0][0] = b3[0][4]; + b3[0][1] = b3[0][5]; + b3[1][0] = b3[1][4]; + b3[1][1] = b3[1][5]; + b5[0] = b5[4]; + b5[1] = b5[5]; + ma565 += 16; + b565 += 16; + x += 16; + } while (x < width); +} + +template <int shift> +inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) { + // ma: 255 * 32 = 8160 (13 bits) + // b: 65088 * 32 = 2082816 (21 bits) + // 
v: b - ma * 255 (22 bits) + const __m128i v = _mm_sub_epi32(b, ma_x_src); + // kSgrProjSgrBits = 8 + // kSgrProjRestoreBits = 4 + // shift = 4 or 5 + // v >> 8 or 9 (13 bits) + return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits); +} + +template <int shift> +inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma, + const __m128i b[2]) { + const __m128i ma_x_src_lo = VmullLo16(ma, src); + const __m128i ma_x_src_hi = VmullHi16(ma, src); + const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]); + const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]); + return _mm_packs_epi32(dst_lo, dst_hi); // 13 bits +} + +inline __m128i CalculateFilteredOutputPass1(const __m128i src, + const __m128i ma[2], + const __m128i b[2][2]) { + const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]); + __m128i b_sum[2]; + b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]); + b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]); + return CalculateFilteredOutput<5>(src, ma_sum, b_sum); +} + +inline __m128i CalculateFilteredOutputPass2(const __m128i src, + const __m128i ma[3], + const __m128i b[3][2]) { + const __m128i ma_sum = Sum3_16(ma); + __m128i b_sum[2]; + Sum3_32(b, b_sum); + return CalculateFilteredOutput<5>(src, ma_sum, b_sum); +} + +inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) { + const __m128i v_lo = + VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits); + const __m128i v_hi = + VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits); + const __m128i vv = _mm_packs_epi32(v_lo, v_hi); + return _mm_add_epi16(src, vv); +} + +inline __m128i SelfGuidedDoubleMultiplier(const __m128i src, + const __m128i filter[2], const int w0, + const int w2) { + __m128i v[2]; + const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0)); + const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]); + const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]); + v[0] = _mm_madd_epi16(w0_w2, f_lo); + v[1] = 
_mm_madd_epi16(w0_w2, f_hi); + return SelfGuidedFinal(src, v); +} + +inline __m128i SelfGuidedSingleMultiplier(const __m128i src, + const __m128i filter, const int w0) { + // weight: -96 to 96 (Sgrproj_Xqd_Min/Max) + __m128i v[2]; + v[0] = VmullNLo8(filter, w0); + v[1] = VmullNHi8(filter, w0); + return SelfGuidedFinal(src, v); +} + +inline void ClipAndStore(uint16_t* const dst, const __m128i val) { + const __m128i val0 = _mm_max_epi16(val, _mm_setzero_si128()); + const __m128i val1 = _mm_min_epi16(val0, _mm_set1_epi16(1023)); + StoreAligned16(dst, val1); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( + const uint16_t* const src, const uint16_t* const src0, + const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width, + const uint32_t scale, const int16_t w0, uint16_t* const ma565[2], + uint32_t* const b565[2], uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + __m128i s[2][4], mas[2], sq[2][8], bs[6]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq[0]); + Square(s[1][0], sq[1]); + BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs); + + int x = 0; + do { + __m128i ma[2], ma5[3], b[2][2], p[2]; + s[0][2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[0][3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + s[1][2] = LoadUnaligned16Msan(src1 + x + 16, + overread_in_bytes + sizeof(*src1) * (x + 16)); + s[1][3] = LoadUnaligned16Msan(src1 + x + 24, + overread_in_bytes + sizeof(*src1) * (x + 24)); + BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas, + bs); + 
Prepare3_8<0>(mas, ma5); + ma[1] = Sum565Lo(ma5); + StoreAligned16(ma565[1] + x, ma[1]); + Sum565(bs, b[1]); + StoreAligned32U32(b565[1] + x, b[1]); + const __m128i sr0_lo = LoadAligned16(src + x + 0); + const __m128i sr1_lo = LoadAligned16(src + stride + x + 0); + ma[0] = LoadAligned16(ma565[0] + x); + LoadAligned32U32(b565[0] + x, b[0]); + p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b); + p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]); + const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0); + const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0); + + ma[1] = Sum565Hi(ma5); + StoreAligned16(ma565[1] + x + 8, ma[1]); + Sum565(bs + 2, b[1]); + StoreAligned32U32(b565[1] + x + 8, b[1]); + const __m128i sr0_hi = LoadAligned16(src + x + 8); + const __m128i sr1_hi = LoadAligned16(src + stride + x + 8); + ma[0] = LoadAligned16(ma565[0] + x + 8); + LoadAligned32U32(b565[0] + x + 8, b[0]); + p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b); + p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]); + const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0); + ClipAndStore(dst + x + 0, d00); + ClipAndStore(dst + x + 8, d01); + const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0); + ClipAndStore(dst + stride + x + 0, d10); + ClipAndStore(dst + stride + x + 8, d11); + s[0][0] = s[0][2]; + s[0][1] = s[0][3]; + s[1][0] = s[1][2]; + s[1][1] = s[1][3]; + sq[0][2] = sq[0][6]; + sq[0][3] = sq[0][7]; + sq[1][2] = sq[1][6]; + sq[1][3] = sq[1][7]; + mas[0] = mas[1]; + bs[0] = bs[4]; + bs[1] = bs[5]; + x += 16; + } while (x < width); +} + +inline void BoxFilterPass1LastRow( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0, + uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565, + uint32_t* b565, uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + __m128i s[4], 
mas[2], sq[8], bs[6]; + s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + Square(s[0], sq); + BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0], bs); + + int x = 0; + do { + __m128i ma[2], ma5[3], b[2][2]; + s[2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5, + sq, mas, bs); + Prepare3_8<0>(mas, ma5); + ma[1] = Sum565Lo(ma5); + Sum565(bs, b[1]); + ma[0] = LoadAligned16(ma565); + LoadAligned32U32(b565, b[0]); + const __m128i sr_lo = LoadAligned16(src + x + 0); + __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b); + const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0); + + ma[1] = Sum565Hi(ma5); + Sum565(bs + 2, b[1]); + ma[0] = LoadAligned16(ma565 + 8); + LoadAligned32U32(b565 + 8, b[0]); + const __m128i sr_hi = LoadAligned16(src + x + 8); + p = CalculateFilteredOutputPass1(sr_hi, ma, b); + const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0); + ClipAndStore(dst + x + 0, d0); + ClipAndStore(dst + x + 8, d1); + s[1] = s[3]; + sq[2] = sq[6]; + sq[3] = sq[7]; + mas[0] = mas[1]; + bs[0] = bs[4]; + bs[1] = bs[5]; + ma565 += 16; + b565 += 16; + x += 16; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass2( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], + uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3], + uint32_t* const b444[2], uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass2 - sizeof(*src0) * width; + __m128i s[4], mas[2], sq[8], bs[6]; + s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[1] = 
LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + Square(s[0], sq); + BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs); + + int x = 0; + do { + s[2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas, + bs); + __m128i ma[3], b[3][2], ma3[3]; + Prepare3_8<0>(mas, ma3); + Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2], + b444[1]); + const __m128i sr_lo = LoadAligned16(src + x + 0); + ma[0] = LoadAligned16(ma343[0] + x); + ma[1] = LoadAligned16(ma444[0] + x); + LoadAligned32U32(b343[0] + x, b[0]); + LoadAligned32U32(b444[0] + x, b[1]); + const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b); + + Store343_444Hi(ma3, bs + 2, x + 8, &ma[2], b[2], ma343[2], ma444[1], + b343[2], b444[1]); + const __m128i sr_hi = LoadAligned16(src + x + 8); + ma[0] = LoadAligned16(ma343[0] + x + 8); + ma[1] = LoadAligned16(ma444[0] + x + 8); + LoadAligned32U32(b343[0] + x + 8, b[0]); + LoadAligned32U32(b444[0] + x + 8, b[1]); + const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b); + const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0); + const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0); + ClipAndStore(dst + x + 0, d0); + ClipAndStore(dst + x + 8, d1); + s[1] = s[3]; + sq[2] = sq[6]; + sq[3] = sq[7]; + mas[0] = mas[1]; + bs[0] = bs[4]; + bs[1] = bs[5]; + x += 16; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilter( + const uint16_t* const src, const uint16_t* const src0, + const uint16_t* const src1, const ptrdiff_t stride, const int width, + const uint16_t scales[2], const int16_t w0, const int16_t w2, + uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* const ma343[4], + uint16_t* 
const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4], + uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq[0]); + Square(s[1][0], sq[1]); + BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq, + ma3, b3, &ma5[0], b5); + + int x = 0; + do { + __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3]; + s[0][2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[0][3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + s[1][2] = LoadUnaligned16Msan(src1 + x + 16, + overread_in_bytes + sizeof(*src1) * (x + 16)); + s[1][3] = LoadUnaligned16Msan(src1 + x + 24, + overread_in_bytes + sizeof(*src1) * (x + 24)); + BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5, + sum_width, sq, ma3, b3, ma5, b5); + Prepare3_8<0>(ma3[0], ma3x[0]); + Prepare3_8<0>(ma3[1], ma3x[1]); + Prepare3_8<0>(ma5, ma5x); + Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1], + ma343[2], ma444[1], b343[2], b444[1]); + Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2], + b343[3], b444[2]); + ma[0][1] = Sum565Lo(ma5x); + StoreAligned16(ma565[1] + x, ma[0][1]); + Sum565(b5, b[0][1]); + StoreAligned32U32(b565[1] + x, b[0][1]); + const __m128i sr0_lo = LoadAligned16(src + x); + const __m128i sr1_lo = LoadAligned16(src + stride + x); + ma[0][0] = LoadAligned16(ma565[0] + x); + LoadAligned32U32(b565[0] + x, b[0][0]); + p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]); + 
p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]); + ma[1][0] = LoadAligned16(ma343[0] + x); + ma[1][1] = LoadAligned16(ma444[0] + x); + LoadAligned32U32(b343[0] + x, b[1][0]); + LoadAligned32U32(b444[0] + x, b[1][1]); + p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]); + const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2); + ma[2][0] = LoadAligned16(ma343[1] + x); + LoadAligned32U32(b343[1] + x, b[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]); + const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2); + + Store343_444Hi(ma3x[0], b3[0] + 2, x + 8, &ma[1][2], &ma[2][1], b[1][2], + b[2][1], ma343[2], ma444[1], b343[2], b444[1]); + Store343_444Hi(ma3x[1], b3[1] + 2, x + 8, &ma[2][2], b[2][2], ma343[3], + ma444[2], b343[3], b444[2]); + ma[0][1] = Sum565Hi(ma5x); + StoreAligned16(ma565[1] + x + 8, ma[0][1]); + Sum565(b5 + 2, b[0][1]); + StoreAligned32U32(b565[1] + x + 8, b[0][1]); + const __m128i sr0_hi = LoadAligned16(src + x + 8); + const __m128i sr1_hi = LoadAligned16(src + stride + x + 8); + ma[0][0] = LoadAligned16(ma565[0] + x + 8); + LoadAligned32U32(b565[0] + x + 8, b[0][0]); + p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]); + ma[1][0] = LoadAligned16(ma343[0] + x + 8); + ma[1][1] = LoadAligned16(ma444[0] + x + 8); + LoadAligned32U32(b343[0] + x + 8, b[1][0]); + LoadAligned32U32(b444[0] + x + 8, b[1][1]); + p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]); + const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2); + ClipAndStore(dst + x + 0, d00); + ClipAndStore(dst + x + 8, d01); + ma[2][0] = LoadAligned16(ma343[1] + x + 8); + LoadAligned32U32(b343[1] + x + 8, b[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]); + const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2); + ClipAndStore(dst + stride + x + 0, d10); + ClipAndStore(dst + stride + x + 8, 
d11); + s[0][0] = s[0][2]; + s[0][1] = s[0][3]; + s[1][0] = s[1][2]; + s[1][1] = s[1][3]; + sq[0][2] = sq[0][6]; + sq[0][3] = sq[0][7]; + sq[1][2] = sq[1][6]; + sq[1][3] = sq[1][7]; + ma3[0][0] = ma3[0][1]; + ma3[1][0] = ma3[1][1]; + ma5[0] = ma5[1]; + b3[0][0] = b3[0][4]; + b3[0][1] = b3[0][5]; + b3[1][0] = b3[1][4]; + b3[1][1] = b3[1][5]; + b5[0] = b5[4]; + b5[1] = b5[5]; + x += 16; + } while (x < width); +} + +inline void BoxFilterLastRow( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0, + const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565, + uint32_t* const b343, uint32_t* const b444, uint32_t* const b565, + uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + __m128i s[4], ma3[2], ma5[2], sq[8], b3[6], b5[6], ma[3], b[3][2]; + s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + Square(s[0], sq); + BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5, + sq, &ma3[0], &ma5[0], b3, b5); + + int x = 0; + do { + __m128i ma3x[3], ma5x[3], p[2]; + s[2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5, + square_sum3, square_sum5, sq, ma3, ma5, b3, b5); + Prepare3_8<0>(ma3, ma3x); + Prepare3_8<0>(ma5, ma5x); + ma[1] = Sum565Lo(ma5x); + Sum565(b5, b[1]); + ma[2] = Sum343Lo(ma3x); + Sum343(b3, b[2]); + const __m128i sr_lo = LoadAligned16(src + x + 0); + ma[0] = LoadAligned16(ma565 + x); + LoadAligned32U32(b565 + x, b[0]); + p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b); + 
ma[0] = LoadAligned16(ma343 + x); + ma[1] = LoadAligned16(ma444 + x); + LoadAligned32U32(b343 + x, b[0]); + LoadAligned32U32(b444 + x, b[1]); + p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b); + const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2); + + ma[1] = Sum565Hi(ma5x); + Sum565(b5 + 2, b[1]); + ma[2] = Sum343Hi(ma3x); + Sum343(b3 + 2, b[2]); + const __m128i sr_hi = LoadAligned16(src + x + 8); + ma[0] = LoadAligned16(ma565 + x + 8); + LoadAligned32U32(b565 + x + 8, b[0]); + p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b); + ma[0] = LoadAligned16(ma343 + x + 8); + ma[1] = LoadAligned16(ma444 + x + 8); + LoadAligned32U32(b343 + x + 8, b[0]); + LoadAligned32U32(b444 + x + 8, b[1]); + p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b); + const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2); + ClipAndStore(dst + x + 0, d0); + ClipAndStore(dst + x + 8, d1); + s[1] = s[3]; + sq[2] = sq[6]; + sq[3] = sq[7]; + ma3[0] = ma3[1]; + ma5[0] = ma5[1]; + b3[0] = b3[4]; + b3[1] = b3[5]; + b5[0] = b5[4]; + b5[1] = b5[5]; + x += 16; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( + const RestorationUnitInfo& restoration_info, const uint16_t* src, + const ptrdiff_t stride, const uint16_t* const top_border, + const ptrdiff_t top_border_stride, const uint16_t* bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + SgrBuffer* const sgr_buffer, uint16_t* dst) { + const auto temp_stride = Align<ptrdiff_t>(width, 16); + const auto sum_width = Align<ptrdiff_t>(width + 8, 16); + const auto sum_stride = temp_stride + 16; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12. 
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2]; + uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2]; + sum3[0] = sgr_buffer->sum3; + square_sum3[0] = sgr_buffer->square_sum3; + ma343[0] = sgr_buffer->ma343; + b343[0] = sgr_buffer->b343; + for (int i = 1; i <= 3; ++i) { + sum3[i] = sum3[i - 1] + sum_stride; + square_sum3[i] = square_sum3[i - 1] + sum_stride; + ma343[i] = ma343[i - 1] + temp_stride; + b343[i] = b343[i - 1] + temp_stride; + } + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma444[0] = sgr_buffer->ma444; + b444[0] = sgr_buffer->b444; + for (int i = 1; i <= 2; ++i) { + ma444[i] = ma444[i - 1] + temp_stride; + b444[i] = b444[i - 1] + temp_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scales[0] != 0); + assert(scales[1] != 0); + BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0], + sum5[1], square_sum3[0], square_sum5[1]); + sum5[0] = sum5[1]; + square_sum5[0] = square_sum5[1]; + const uint16_t* const s = (height > 1) ? 
src + stride : bottom_border; + BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3, + square_sum5, sum_width, ma343, ma444[0], ma565[0], + b343, b444[0], b565[0]); + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate4PointersBy2<uint16_t>(sum3); + Circulate4PointersBy2<uint32_t>(square_sum3); + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width, + scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width, + ma343, ma444, ma565, b343, b444, b565, dst); + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2<uint16_t>(ma343); + Circulate4PointersBy2<uint32_t>(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate4PointersBy2<uint16_t>(sum3); + Circulate4PointersBy2<uint32_t>(square_sum3); + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + if ((height & 1) == 0 || height > 1) { + const uint16_t* sr[2]; + if ((height & 1) == 0) { + sr[0] = bottom_border; + sr[1] = bottom_border + bottom_border_stride; + } else { + sr[0] = src + 2 * stride; + sr[1] = bottom_border; + } + BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5, + square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343, + b444, b565, dst); + } + if ((height & 1) != 0) { + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2<uint16_t>(sum3); + Circulate4PointersBy2<uint32_t>(square_sum3); + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + Circulate4PointersBy2<uint16_t>(ma343); + Circulate4PointersBy2<uint32_t>(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + 
BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width, + sum_width, scales, w0, w2, sum3, sum5, square_sum3, + square_sum5, ma343[0], ma444[0], ma565[0], b343[0], + b444[0], b565[0], dst); + } +} + +inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, + const uint16_t* src, const ptrdiff_t stride, + const uint16_t* const top_border, + const ptrdiff_t top_border_stride, + const uint16_t* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint16_t* dst) { + const auto temp_stride = Align<ptrdiff_t>(width, 16); + const auto sum_width = Align<ptrdiff_t>(width + 8, 16); + const auto sum_stride = temp_stride + 16; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12. + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + uint16_t *sum5[5], *ma565[2]; + uint32_t *square_sum5[5], *b565[2]; + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scale != 0); + BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width, + sum5[1], square_sum5[1]); + sum5[0] = sum5[1]; + square_sum5[0] = square_sum5[1]; + const uint16_t* const s = (height > 1) ? 
src + stride : bottom_border; + BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width, + ma565[0], b565[0]); + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5, + square_sum5, width, sum_width, scale, w0, ma565, b565, dst); + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + if ((height & 1) == 0 || height > 1) { + const uint16_t* sr[2]; + if ((height & 1) == 0) { + sr[0] = bottom_border; + sr[1] = bottom_border + bottom_border_stride; + } else { + sr[0] = src + 2 * stride; + sr[1] = bottom_border; + } + BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width, + sum_width, scale, w0, ma565, b565, dst); + } + if ((height & 1) != 0) { + src += 3; + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + } + BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width, + sum_width, scale, w0, sum5, square_sum5, ma565[0], + b565[0], dst); + } +} + +inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, + const uint16_t* src, const ptrdiff_t stride, + const uint16_t* const top_border, + const ptrdiff_t top_border_stride, + const uint16_t* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint16_t* dst) { + assert(restoration_info.sgr_proj_info.multiplier[0] == 0); + const auto temp_stride = Align<ptrdiff_t>(width, 16); + const auto sum_width = Align<ptrdiff_t>(width + 8, 16); + const auto 
sum_stride = temp_stride + 16; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12. + uint16_t *sum3[3], *ma343[3], *ma444[2]; + uint32_t *square_sum3[3], *b343[3], *b444[2]; + sum3[0] = sgr_buffer->sum3; + square_sum3[0] = sgr_buffer->square_sum3; + ma343[0] = sgr_buffer->ma343; + b343[0] = sgr_buffer->b343; + for (int i = 1; i <= 2; ++i) { + sum3[i] = sum3[i - 1] + sum_stride; + square_sum3[i] = square_sum3[i - 1] + sum_stride; + ma343[i] = ma343[i - 1] + temp_stride; + b343[i] = b343[i - 1] + temp_stride; + } + ma444[0] = sgr_buffer->ma444; + ma444[1] = ma444[0] + temp_stride; + b444[0] = sgr_buffer->b444; + b444[1] = b444[0] + temp_stride; + assert(scale != 0); + BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width, + sum3[0], square_sum3[0]); + BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, + sum_width, ma343[0], nullptr, b343[0], + nullptr); + Circulate3PointersBy1<uint16_t>(sum3); + Circulate3PointersBy1<uint32_t>(square_sum3); + const uint16_t* s; + if (height > 1) { + s = src + stride; + } else { + s = bottom_border; + bottom_border += bottom_border_stride; + } + BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width, + ma343[1], ma444[0], b343[1], b444[0]); + + for (int y = height - 2; y > 0; --y) { + Circulate3PointersBy1<uint16_t>(sum3); + Circulate3PointersBy1<uint32_t>(square_sum3); + BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3, + square_sum3, ma343, ma444, b343, b444, dst); + src += stride; + dst += stride; + Circulate3PointersBy1<uint16_t>(ma343); + Circulate3PointersBy1<uint32_t>(b343); + std::swap(ma444[0], ma444[1]); + std::swap(b444[0], b444[1]); + } + + int y = std::min(height, 2); + src += 2; + do { + Circulate3PointersBy1<uint16_t>(sum3); + 
Circulate3PointersBy1<uint32_t>(square_sum3); + BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3, + square_sum3, ma343, ma444, b343, b444, dst); + src += stride; + dst += stride; + bottom_border += bottom_border_stride; + Circulate3PointersBy1<uint16_t>(ma343); + Circulate3PointersBy1<uint32_t>(b343); + std::swap(ma444[0], ma444[1]); + std::swap(b444[0], b444[1]); + } while (--y != 0); +} + +// If |width| is non-multiple of 16, up to 15 more pixels are written to |dest| +// in the end of each row. It is safe to overwrite the output as it will not be +// part of the visible frame. +void SelfGuidedFilter_SSE4_1( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { + const int index = restoration_info.sgr_proj_info.index; + const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 + const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0 + const auto* const src = static_cast<const uint16_t*>(source); + const auto* const top = static_cast<const uint16_t*>(top_border); + const auto* const bottom = static_cast<const uint16_t*>(bottom_border); + auto* const dst = static_cast<uint16_t*>(dest); + SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer; + if (radius_pass_1 == 0) { + // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the + // following assertion. 
+ assert(radius_pass_0 != 0); + BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, + width, height, sgr_buffer, dst); + } else if (radius_pass_0 == 0) { + BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2, + top_border_stride, bottom - 2, bottom_border_stride, + width, height, sgr_buffer, dst); + } else { + BoxFilterProcess(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, width, + height, sgr_buffer, dst); + } +} + void Init10bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); assert(dsp != nullptr); @@ -531,6 +2505,11 @@ void Init10bpp() { #else static_cast<void>(WienerFilter_SSE4_1); #endif +#if DSP_ENABLED_10BPP_SSE4_1(SelfGuidedFilter) + dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1; +#else + static_cast<void>(SelfGuidedFilter_SSE4_1); +#endif } } // namespace @@ -540,7 +2519,7 @@ void LoopRestorationInit10bpp_SSE4_1() { Init10bpp(); } } // namespace dsp } // namespace libgav1 -#else // !(LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10) +#else // !(LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10) namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/loop_restoration_avx2.cc b/src/dsp/x86/loop_restoration_avx2.cc index 7ae7c90..351a324 100644 --- a/src/dsp/x86/loop_restoration_avx2.cc +++ b/src/dsp/x86/loop_restoration_avx2.cc @@ -28,7 +28,6 @@ #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/dsp/x86/common_avx2.h" -#include "src/dsp/x86/common_sse4.h" #include "src/utils/common.h" #include "src/utils/constants.h" @@ -116,7 +115,8 @@ inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride, filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0100)); filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302)); filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0102)); - filter[3] = 
_mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8000)); + filter[3] = _mm256_shuffle_epi8( + coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8000))); for (int y = height; y != 0; --y) { __m256i s = LoadUnaligned32(src); __m256i ss[4]; @@ -144,7 +144,8 @@ inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride, __m256i filter[3]; filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0201)); filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0203)); - filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8001)); + filter[2] = _mm256_shuffle_epi8( + coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8001))); for (int y = height; y != 0; --y) { __m256i s = LoadUnaligned32(src); __m256i ss[4]; @@ -171,7 +172,8 @@ inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride, int16_t** const wiener_buffer) { __m256i filter[2]; filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302)); - filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8002)); + filter[1] = _mm256_shuffle_epi8( + coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8002))); for (int y = height; y != 0; --y) { __m256i s = LoadUnaligned32(src); __m256i ss[4]; @@ -480,12 +482,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer, } } -void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info, - const void* const source, const void* const top_border, - const void* const bottom_border, const ptrdiff_t stride, - const int width, const int height, - RestorationBuffer* const restoration_buffer, - void* const dest) { +void WienerFilter_AVX2( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* 
const dest) { const int16_t* const number_leading_zero_coefficients = restoration_info.wiener_info.number_leading_zero_coefficients; const int number_rows_to_skip = std::max( @@ -515,39 +517,42 @@ void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info, c_horizontal = _mm_packs_epi16(c_horizontal, c_horizontal); const __m256i coefficients_horizontal = _mm256_broadcastd_epi32(c_horizontal); if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { - WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride, - wiener_stride, height_extra, coefficients_horizontal, - &wiener_buffer_horizontal); - WienerHorizontalTap7(src - 3, stride, wiener_stride, height, + WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3, + top_border_stride, wiener_stride, height_extra, coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra, + WienerHorizontalTap7(src - 3, stride, wiener_stride, height, coefficients_horizontal, &wiener_buffer_horizontal); - } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { - WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride, - wiener_stride, height_extra, coefficients_horizontal, + WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride, + height_extra, coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(src - 2, stride, wiener_stride, height, + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2, + top_border_stride, wiener_stride, height_extra, coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra, + WienerHorizontalTap5(src - 2, stride, wiener_stride, height, coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride, + height_extra, 
coefficients_horizontal, + &wiener_buffer_horizontal); } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { // The maximum over-reads happen here. - WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride, - wiener_stride, height_extra, coefficients_horizontal, - &wiener_buffer_horizontal); - WienerHorizontalTap3(src - 1, stride, wiener_stride, height, + WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1, + top_border_stride, wiener_stride, height_extra, coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra, + WienerHorizontalTap3(src - 1, stride, wiener_stride, height, coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride, + height_extra, coefficients_horizontal, + &wiener_buffer_horizontal); } else { assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); - WienerHorizontalTap1(top + (2 - height_extra) * stride, stride, - wiener_stride, height_extra, + WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride, + top_border_stride, wiener_stride, height_extra, &wiener_buffer_horizontal); WienerHorizontalTap1(src, stride, wiener_stride, height, &wiener_buffer_horizontal); - WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra, - &wiener_buffer_horizontal); + WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride, + height_extra, &wiener_buffer_horizontal); } // vertical filtering. @@ -765,17 +770,6 @@ inline __m256i VaddwHi16(const __m256i src0, const __m256i src1) { return _mm256_add_epi32(src0, s1); } -// Using VgetLane16() can save a sign extension instruction. 
-template <int n> -inline int VgetLane16(__m256i src) { - return _mm256_extract_epi16(src, n); -} - -template <int n> -inline int VgetLane8(__m256i src) { - return _mm256_extract_epi8(src, n); -} - inline __m256i VmullNLo8(const __m256i src0, const int src1) { const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256()); return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1)); @@ -1253,9 +1247,8 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, do { const __m128i s0 = LoadUnaligned16Msan(src, kOverreadInBytesPass1_128 - width); - __m128i sq_128[2]; + __m128i sq_128[2], s3, s5, sq3[2], sq5[2]; __m256i sq[3]; - __m128i s3, s5, sq3[2], sq5[2]; sq_128[0] = SquareLo8(s0); sq_128[1] = SquareHi8(s0); SumHorizontalLo(s0, &s3, &s5); @@ -1432,11 +1425,43 @@ inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2], return _mm256_packus_epi32(z0, z1); } -template <int n> -inline __m128i CalculateB(const __m128i sum, const __m128i ma) { - static_assert(n == 9 || n == 25, ""); +inline __m128i CalculateB5(const __m128i sum, const __m128i ma) { + // one_over_n == 164. constexpr uint32_t one_over_n = - ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; + ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25; + // one_over_n_quarter == 41. + constexpr uint32_t one_over_n_quarter = one_over_n >> 2; + static_assert(one_over_n == one_over_n_quarter << 2, ""); + // |ma| is in range [0, 255]. + const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter)); + const __m128i m0 = VmullLo16(m, sum); + const __m128i m1 = VmullHi16(m, sum); + const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2); + const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2); + return _mm_packus_epi32(b_lo, b_hi); +} + +inline __m256i CalculateB5(const __m256i sum, const __m256i ma) { + // one_over_n == 164. + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25; + // one_over_n_quarter == 41. 
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2; + static_assert(one_over_n == one_over_n_quarter << 2, ""); + // |ma| is in range [0, 255]. + const __m256i m = + _mm256_maddubs_epi16(ma, _mm256_set1_epi16(one_over_n_quarter)); + const __m256i m0 = VmullLo16(m, sum); + const __m256i m1 = VmullHi16(m, sum); + const __m256i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2); + const __m256i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2); + return _mm256_packus_epi32(b_lo, b_hi); +} + +inline __m128i CalculateB3(const __m128i sum, const __m128i ma) { + // one_over_n == 455. + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9; const __m128i m0 = VmullLo16(ma, sum); const __m128i m1 = VmullHi16(ma, sum); const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n)); @@ -1446,11 +1471,10 @@ inline __m128i CalculateB(const __m128i sum, const __m128i ma) { return _mm_packus_epi32(b_lo, b_hi); } -template <int n> -inline __m256i CalculateB(const __m256i sum, const __m256i ma) { - static_assert(n == 9 || n == 25, ""); +inline __m256i CalculateB3(const __m256i sum, const __m256i ma) { + // one_over_n == 455. constexpr uint32_t one_over_n = - ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; + ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9; const __m256i m0 = VmullLo16(ma, sum); const __m256i m1 = VmullHi16(ma, sum); const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n)); @@ -1525,7 +1549,7 @@ inline void LookupIntermediate(const __m128i sum, const __m128i index, // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128()); - *b = CalculateB<n>(sum, maq); + *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq); } // Repeat the first 48 elements in kSgrMaLookup with a period of 16. 
@@ -1539,7 +1563,7 @@ alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = { // Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b // to get value 0 as the shuffle result. The most significiant bit 1 comes -// either from the comparision instruction, or from the sign bit of the index. +// either from the comparison instruction, or from the sign bit of the index. inline __m256i ShuffleIndex(const __m256i table, const __m256i index) { __m256i mask; mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15)); @@ -1558,15 +1582,15 @@ template <int n> inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2], __m256i ma[3], __m256i b[2]) { static_assert(n == 9 || n == 25, ""); - // Use table lookup to read elements which indices are less than 48. + // Use table lookup to read elements whose indices are less than 48. const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32); const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32); const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32); const __m256i indices = _mm256_packus_epi16(index[0], index[1]); __m256i idx, mas; - // Clip idx to 127 to apply signed comparision instructions. + // Clip idx to 127 to apply signed comparison instructions. idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127)); - // All elements which indices are less than 48 are set to 0. + // All elements whose indices are less than 48 are set to 0. // Get shuffle results for indices in range [0, 15]. mas = ShuffleIndex(c0, idx); // Get shuffle results for indices in range [16, 31]. 
@@ -1581,12 +1605,12 @@ inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2], const __m256i res2 = ShuffleIndex(c2, idx); mas = _mm256_or_si256(mas, res2); - // For elements which indices are larger than 47, since they seldom change + // For elements whose indices are larger than 47, since they seldom change // values with the increase of the index, we use comparison and arithmetic // operations to calculate their values. - // Add -128 to apply signed comparision instructions. + // Add -128 to apply signed comparison instructions. idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128)); - // Elements which indices are larger than 47 (with value 0) are set to 5. + // Elements whose indices are larger than 47 (with value 0) are set to 5. mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5)); mas = AdjustValue(mas, idx, 55); // 55 is the last index which value is 5. mas = AdjustValue(mas, idx, 72); // 72 is the last index which value is 4. @@ -1611,8 +1635,13 @@ inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2], // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). 
const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256()); const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256()); - b[0] = CalculateB<n>(sum[0], maq0); - b[1] = CalculateB<n>(sum[1], maq1); + if (n == 9) { + b[0] = CalculateB3(sum[0], maq0); + b[1] = CalculateB3(sum[1], maq1); + } else { + b[0] = CalculateB5(sum[0], maq0); + b[1] = CalculateB5(sum[1], maq1); + } } inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2], @@ -1903,8 +1932,8 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( __m256i b3[2][5], __m256i ma5[3], __m256i b5[5]) { const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8); const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8); - __m256i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sq3t[4][2], sq5t[5][2], - sum_3[2][2], index_3[2][2], sum_5[2], index_5[2]; + __m256i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2][2], index_3[2][2], + sum_5[2], index_5[2]; sq[0][1] = SquareLo8(s0); sq[0][2] = SquareHi8(s0); sq[1][1] = SquareLo8(s1); @@ -1938,22 +1967,22 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( LoadAligned64x3U32(square_sum5, x, sq5); CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]); - SumHorizontal(sq[0] + 1, &sq3t[2][0], &sq3t[2][1], &sq5t[3][0], &sq5t[3][1]); - SumHorizontal(sq[1] + 1, &sq3t[3][0], &sq3t[3][1], &sq5t[4][0], &sq5t[4][1]); - StoreAligned64(square_sum3[2] + x + 16, sq3t[2]); - StoreAligned64(square_sum5[3] + x + 16, sq5t[3]); - StoreAligned64(square_sum3[3] + x + 16, sq3t[3]); - StoreAligned64(square_sum5[4] + x + 16, sq5t[4]); + SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned64(square_sum3[2] + x + 16, sq3[2]); + StoreAligned64(square_sum5[3] + x + 16, sq5[3]); + StoreAligned64(square_sum3[3] + x + 16, sq3[3]); + StoreAligned64(square_sum5[4] + x + 16, sq5[4]); LoadAligned32x2U16Msan(sum3, x 
+ 16, sum_width, s3[1]); - LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3t); - CalculateSumAndIndex3(s3[1], sq3t, scales[1], &sum_3[0][1], &index_3[0][1]); - CalculateSumAndIndex3(s3[1] + 1, sq3t + 1, scales[1], &sum_3[1][1], + LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[0][1], &index_3[0][1]); + CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum_3[1][1], &index_3[1][1]); CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], b3[0] + 1); CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], b3[1] + 1); LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); - LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5t); - CalculateSumAndIndex5(s5[1], sq5t, scales[0], &sum_5[1], &index_5[1]); + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]); CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1); b3[0][0] = _mm256_permute2x128_si256(b3[0][0], b3[0][2], 0x21); b3[1][0] = _mm256_permute2x128_si256(b3[1][0], b3[1][2], 0x21); @@ -1988,8 +2017,8 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( __m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5], __m256i b5[5]) { const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8); - __m256i s3[2][3], s5[2][5], sq3[4][2], sq3t[4][2], sq5[5][2], sq5t[5][2], - sum_3[2], index_3[2], sum_5[2], index_5[2]; + __m256i s3[2][3], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2], index_3[2], + sum_5[2], index_5[2]; sq[1] = SquareLo8(s0); sq[2] = SquareHi8(s0); sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); @@ -2006,17 +2035,17 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( sq5[4][1] = sq5[3][1]; CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]); - SumHorizontal(sq + 1, &sq3t[2][0], &sq3t[2][1], &sq5t[3][0], &sq5t[3][1]); + SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); 
LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]); - LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3t); - CalculateSumAndIndex3(s3[1], sq3t, scales[1], &sum_3[1], &index_3[1]); + LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[1], &index_3[1]); CalculateIntermediate<9>(sum_3, index_3, ma3, b3 + 1); LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); s5[1][4] = s5[1][3]; - LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5t); - sq5t[4][0] = sq5t[3][0]; - sq5t[4][1] = sq5t[3][1]; - CalculateSumAndIndex5(s5[1], sq5t, scales[0], &sum_5[1], &index_5[1]); + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]); CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1); b3[0] = _mm256_permute2x128_si256(b3[0], b3[2], 0x21); b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21); @@ -2071,9 +2100,9 @@ LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3( uint16_t* const sum3[3], uint32_t* const square_sum3[3], const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343, uint32_t* b444) { + const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytesPass2_128 - width); __m128i ma0, sq_128[2], b0; __m256i mas[3], sq[3], bs[3]; - const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytesPass2_128 - width); sq_128[0] = SquareLo8(s); BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, &b0); sq[0] = SetrM128i(sq_128[0], sq_128[1]); @@ -2115,9 +2144,9 @@ inline void BoxSumFilterPreProcess( const uint8_t* const src0, const uint8_t* const src1, const int width, const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - const ptrdiff_t sum_width, uint16_t* const ma343[4], - uint16_t* const ma444[2], uint16_t* ma565, uint32_t* const b343[4], - uint32_t* const 
b444[2], uint32_t* b565) { + const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444, + uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444, + uint32_t* b565) { __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0; __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5]; s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width); @@ -2151,9 +2180,8 @@ inline void BoxSumFilterPreProcess( Sum565W(b5, b); StoreAligned64(b565, b); Prepare3_8(ma3[1], ma3x); - Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444[0], b343[1], b444[0]); - Store343_444Hi(ma3x, b3[1] + 1, x + 16, ma343[1], ma444[0], b343[1], - b444[0]); + Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444); + Store343_444Hi(ma3x, b3[1] + 1, x + 16, ma343[1], ma444, b343[1], b444); Prepare3_8(ma5, ma5x); ma[0] = Sum565Lo(ma5x); ma[1] = Sum565Hi(ma5x); @@ -2199,8 +2227,9 @@ inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma, return _mm256_packs_epi32(dst_lo, dst_hi); // 13 bits } -inline __m256i CalculateFilteredOutputPass1(const __m256i src, __m256i ma[2], - __m256i b[2][2]) { +inline __m256i CalculateFilteredOutputPass1(const __m256i src, + const __m256i ma[2], + const __m256i b[2][2]) { const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]); __m256i b_sum[2]; b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]); @@ -2208,8 +2237,9 @@ inline __m256i CalculateFilteredOutputPass1(const __m256i src, __m256i ma[2], return CalculateFilteredOutput<5>(src, ma_sum, b_sum); } -inline __m256i CalculateFilteredOutputPass2(const __m256i src, __m256i ma[3], - __m256i b[3][2]) { +inline __m256i CalculateFilteredOutputPass2(const __m256i src, + const __m256i ma[3], + const __m256i b[3][2]) { const __m256i ma_sum = Sum3_16(ma); __m256i b_sum[2]; Sum3_32(b, b_sum); @@ -2267,13 +2297,13 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( int x = 0; do { - __m256i ma[3], ma3[3], b[2][2][2]; + __m256i ma[3], ma5[3], b[2][2][2]; BoxFilterPreProcess5(src0 + x + 8, src1 
+ x + 8, x + 8 + kOverreadInBytesPass1_256 - width, sum_width, x + 8, scale, sum5, square_sum5, sq, mas, bs); - Prepare3_8(mas, ma3); - ma[1] = Sum565Lo(ma3); - ma[2] = Sum565Hi(ma3); + Prepare3_8(mas, ma5); + ma[1] = Sum565Lo(ma5); + ma[2] = Sum565Hi(ma5); StoreAligned64(ma565[1] + x, ma + 1); Sum565W(bs + 0, b[0][1]); Sum565W(bs + 1, b[1][1]); @@ -2511,9 +2541,9 @@ inline void BoxFilterLastRow( const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0, const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5], uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - uint16_t* const ma343[4], uint16_t* const ma444[3], - uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3], - uint32_t* const b565[2], uint8_t* const dst) { + uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565, + uint32_t* const b343, uint32_t* const b444, uint32_t* const b565, + uint8_t* const dst) { const __m128i s0 = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width); __m128i ma3_0, ma5_0, b3_0, b5_0, sq_128[2]; @@ -2542,13 +2572,13 @@ inline void BoxFilterLastRow( Sum343W(b3, b[2]); const __m256i sr = LoadUnaligned32(src + x); const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256()); - ma[0] = LoadAligned32(ma565[0] + x); - LoadAligned64(b565[0] + x, b[0]); + ma[0] = LoadAligned32(ma565 + x); + LoadAligned64(b565 + x, b[0]); p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b); - ma[0] = LoadAligned32(ma343[0] + x); - ma[1] = LoadAligned32(ma444[0] + x); - LoadAligned64(b343[0] + x, b[0]); - LoadAligned64(b444[0] + x, b[1]); + ma[0] = LoadAligned32(ma343 + x); + ma[1] = LoadAligned32(ma444 + x); + LoadAligned64(b343 + x, b[0]); + LoadAligned64(b444 + x, b[1]); p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b); const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2); @@ -2557,13 +2587,13 @@ inline void BoxFilterLastRow( mat[2] = Sum343Hi(ma3x); Sum343W(b3 + 1, b[2]); const __m256i sr_hi = 
_mm256_unpackhi_epi8(sr, _mm256_setzero_si256()); - mat[0] = LoadAligned32(ma565[0] + x + 16); - LoadAligned64(b565[0] + x + 16, b[0]); + mat[0] = LoadAligned32(ma565 + x + 16); + LoadAligned64(b565 + x + 16, b[0]); p[0] = CalculateFilteredOutputPass1(sr_hi, mat, b); - mat[0] = LoadAligned32(ma343[0] + x + 16); - mat[1] = LoadAligned32(ma444[0] + x + 16); - LoadAligned64(b343[0] + x + 16, b[0]); - LoadAligned64(b444[0] + x + 16, b[1]); + mat[0] = LoadAligned32(ma343 + x + 16); + mat[1] = LoadAligned32(ma444 + x + 16); + LoadAligned64(b343 + x + 16, b[0]); + LoadAligned64(b444 + x + 16, b[1]); p[1] = CalculateFilteredOutputPass2(sr_hi, mat, b); const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2); StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1)); @@ -2578,8 +2608,9 @@ inline void BoxFilterLastRow( LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( const RestorationUnitInfo& restoration_info, const uint8_t* src, - const uint8_t* const top_border, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, const int height, + const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, SgrBuffer* const sgr_buffer, uint8_t* dst) { const auto temp_stride = Align<ptrdiff_t>(width, 32); const auto sum_width = temp_stride + 8; @@ -2619,14 +2650,14 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( b565[1] = b565[0] + temp_stride; assert(scales[0] != 0); assert(scales[1] != 0); - BoxSum(top_border, stride, width, sum_stride, temp_stride, sum3[0], sum5[1], - square_sum3[0], square_sum5[1]); + BoxSum(top_border, top_border_stride, width, sum_stride, temp_stride, sum3[0], + sum5[1], square_sum3[0], square_sum5[1]); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; const uint8_t* const s = (height > 1) ? 
src + stride : bottom_border; BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3, - square_sum5, sum_width, ma343, ma444, ma565[0], b343, - b444, b565[0]); + square_sum5, sum_width, ma343, ma444[0], ma565[0], + b343, b444[0], b565[0]); sum5[0] = sgr_buffer->sum5 + kSumOffset; square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset; @@ -2656,7 +2687,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( const uint8_t* sr[2]; if ((height & 1) == 0) { sr[0] = bottom_border; - sr[1] = bottom_border + stride; + sr[1] = bottom_border + bottom_border_stride; } else { sr[0] = src + 2 * stride; sr[1] = bottom_border; @@ -2680,19 +2711,21 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( std::swap(ma565[0], ma565[1]); std::swap(b565[0], b565[1]); } - BoxFilterLastRow(src + 3, bottom_border + stride, width, sum_width, scales, - w0, w2, sum3, sum5, square_sum3, square_sum5, ma343, ma444, - ma565, b343, b444, b565, dst); + BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width, + sum_width, scales, w0, w2, sum3, sum5, square_sum3, + square_sum5, ma343[0], ma444[0], ma565[0], b343[0], + b444[0], b565[0], dst); } } inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, - const uint8_t* src, + const uint8_t* src, const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - uint8_t* dst) { + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { const auto temp_stride = Align<ptrdiff_t>(width, 32); const auto sum_width = temp_stride + 8; const auto sum_stride = temp_stride + 32; @@ -2712,8 +2745,8 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, b565[0] = sgr_buffer->b565; b565[1] = b565[0] + temp_stride; assert(scale != 0); - BoxSum<5>(top_border, stride, width, sum_stride, 
temp_stride, sum5[1], - square_sum5[1]); + BoxSum<5>(top_border, top_border_stride, width, sum_stride, temp_stride, + sum5[1], square_sum5[1]); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; const uint8_t* const s = (height > 1) ? src + stride : bottom_border; @@ -2739,7 +2772,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, const uint8_t* sr[2]; if ((height & 1) == 0) { sr[0] = bottom_border; - sr[1] = bottom_border + stride; + sr[1] = bottom_border + bottom_border_stride; } else { sr[0] = src + 2 * stride; sr[1] = bottom_border; @@ -2757,18 +2790,20 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, Circulate5PointersBy2<uint16_t>(sum5); Circulate5PointersBy2<uint32_t>(square_sum5); } - BoxFilterPass1LastRow(src, bottom_border + stride, width, sum_width, scale, - w0, sum5, square_sum5, ma565[0], b565[0], dst); + BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width, + sum_width, scale, w0, sum5, square_sum5, ma565[0], + b565[0], dst); } } inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, - const uint8_t* src, + const uint8_t* src, const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - uint8_t* dst) { + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { assert(restoration_info.sgr_proj_info.multiplier[0] == 0); const auto temp_stride = Align<ptrdiff_t>(width, 32); const auto sum_width = temp_stride + 8; @@ -2794,8 +2829,8 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, b444[0] = sgr_buffer->b444; b444[1] = b444[0] + temp_stride; assert(scale != 0); - BoxSum<3>(top_border, stride, width, sum_stride, temp_stride, sum3[0], - square_sum3[0]); + BoxSum<3>(top_border, top_border_stride, 
width, sum_stride, temp_stride, + sum3[0], square_sum3[0]); BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, sum_width, ma343[0], nullptr, b343[0], nullptr); @@ -2806,7 +2841,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, s = src + stride; } else { s = bottom_border; - bottom_border += stride; + bottom_border += bottom_border_stride; } BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width, ma343[1], ma444[0], b343[1], b444[0]); @@ -2833,7 +2868,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, square_sum3, ma343, ma444, b343, b444, dst); src += stride; dst += stride; - bottom_border += stride; + bottom_border += bottom_border_stride; Circulate3PointersBy1<uint16_t>(ma343); Circulate3PointersBy1<uint32_t>(b343); std::swap(ma444[0], ma444[1]); @@ -2841,13 +2876,14 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, } while (--y != 0); } -// If |width| is non-multiple of 8, up to 7 more pixels are written to |dest| in -// the end of each row. It is safe to overwrite the output as it will not be +// If |width| is non-multiple of 32, up to 31 more pixels are written to |dest| +// in the end of each row. It is safe to overwrite the output as it will not be // part of the visible frame. 
void SelfGuidedFilter_AVX2( const RestorationUnitInfo& restoration_info, const void* const source, - const void* const top_border, const void* const bottom_border, - const ptrdiff_t stride, const int width, const int height, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, RestorationBuffer* const restoration_buffer, void* const dest) { const int index = restoration_info.sgr_proj_info.index; const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 @@ -2861,14 +2897,17 @@ void SelfGuidedFilter_AVX2( // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the // following assertion. assert(radius_pass_0 != 0); - BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3, - stride, width, height, sgr_buffer, dst); + BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, + width, height, sgr_buffer, dst); } else if (radius_pass_0 == 0) { - BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2, - stride, width, height, sgr_buffer, dst); + BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2, + top_border_stride, bottom - 2, bottom_border_stride, + width, height, sgr_buffer, dst); } else { - BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride, - width, height, sgr_buffer, dst); + BoxFilterProcess(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, width, + height, sgr_buffer, dst); } } @@ -2891,7 +2930,7 @@ void LoopRestorationInit_AVX2() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_AVX2 +#else // !LIBGAV1_TARGETING_AVX2 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/loop_restoration_avx2.h b/src/dsp/x86/loop_restoration_avx2.h index d80227c..2c3534a 100644 --- 
a/src/dsp/x86/loop_restoration_avx2.h +++ b/src/dsp/x86/loop_restoration_avx2.h @@ -47,6 +47,10 @@ void LoopRestorationInit10bpp_AVX2(); #define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2 #endif +#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter +#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2 +#endif + #endif // LIBGAV1_TARGETING_AVX2 #endif // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_ diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc index 24f5ad2..273bcc8 100644 --- a/src/dsp/x86/loop_restoration_sse4.cc +++ b/src/dsp/x86/loop_restoration_sse4.cc @@ -481,13 +481,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer, } } -void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info, - const void* const source, const void* const top_border, - const void* const bottom_border, - const ptrdiff_t stride, const int width, - const int height, - RestorationBuffer* const restoration_buffer, - void* const dest) { +void WienerFilter_SSE4_1( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { const int16_t* const number_leading_zero_coefficients = restoration_info.wiener_info.number_leading_zero_coefficients; const int number_rows_to_skip = std::max( @@ -516,45 +515,48 @@ void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info, const __m128i coefficients_horizontal = _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0)); if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { - WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride, - wiener_stride, height_extra, filter_horizontal[0], - coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap7(src - 3, stride, 
wiener_stride, height, + WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3, + top_border_stride, wiener_stride, height_extra, filter_horizontal[0], coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra, + WienerHorizontalTap7(src - 3, stride, wiener_stride, height, filter_horizontal[0], coefficients_horizontal, &wiener_buffer_horizontal); - } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { - WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride, - wiener_stride, height_extra, filter_horizontal[1], + WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal[0], coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(src - 2, stride, wiener_stride, height, + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2, + top_border_stride, wiener_stride, height_extra, filter_horizontal[1], coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra, + WienerHorizontalTap5(src - 2, stride, wiener_stride, height, filter_horizontal[1], coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal[1], + coefficients_horizontal, &wiener_buffer_horizontal); } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { // The maximum over-reads happen here. 
- WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride, - wiener_stride, height_extra, filter_horizontal[2], - coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap3(src - 1, stride, wiener_stride, height, + WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1, + top_border_stride, wiener_stride, height_extra, filter_horizontal[2], coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra, + WienerHorizontalTap3(src - 1, stride, wiener_stride, height, filter_horizontal[2], coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal[2], + coefficients_horizontal, &wiener_buffer_horizontal); } else { assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); - WienerHorizontalTap1(top + (2 - height_extra) * stride, stride, - wiener_stride, height_extra, + WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride, + top_border_stride, wiener_stride, height_extra, &wiener_buffer_horizontal); WienerHorizontalTap1(src, stride, wiener_stride, height, &wiener_buffer_horizontal); - WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra, - &wiener_buffer_horizontal); + WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride, + height_extra, &wiener_buffer_horizontal); } // vertical filtering. @@ -1160,11 +1162,26 @@ inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2], return _mm_packus_epi32(z0, z1); } -template <int n> -inline __m128i CalculateB(const __m128i sum, const __m128i ma) { - static_assert(n == 9 || n == 25, ""); +inline __m128i CalculateB5(const __m128i sum, const __m128i ma) { + // one_over_n == 164. constexpr uint32_t one_over_n = - ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; + ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25; + // one_over_n_quarter == 41. 
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2; + static_assert(one_over_n == one_over_n_quarter << 2, ""); + // |ma| is in range [0, 255]. + const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter)); + const __m128i m0 = VmullLo16(m, sum); + const __m128i m1 = VmullHi16(m, sum); + const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2); + const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2); + return _mm_packus_epi32(b_lo, b_hi); +} + +inline __m128i CalculateB3(const __m128i sum, const __m128i ma) { + // one_over_n == 455. + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9; const __m128i m0 = VmullLo16(ma, sum); const __m128i m1 = VmullHi16(ma, sum); const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n)); @@ -1227,12 +1244,12 @@ inline void LookupIntermediate(const __m128i sum, const __m128i index, } else { maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128()); } - *b = CalculateB<n>(sum, maq); + *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq); } // Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b // to get value 0 as the shuffle result. The most significiant bit 1 comes -// either from the comparision instruction, or from the sign bit of the index. +// either from the comparison instruction, or from the sign bit of the index. inline __m128i ShuffleIndex(const __m128i table, const __m128i index) { __m128i mask; mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15)); @@ -1250,15 +1267,15 @@ inline __m128i AdjustValue(const __m128i value, const __m128i index, inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], __m128i* const ma, __m128i* const b0, __m128i* const b1) { - // Use table lookup to read elements which indices are less than 48. + // Use table lookup to read elements whose indices are less than 48. 
const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16); const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16); const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16); const __m128i indices = _mm_packus_epi16(index[0], index[1]); __m128i idx; - // Clip idx to 127 to apply signed comparision instructions. + // Clip idx to 127 to apply signed comparison instructions. idx = _mm_min_epu8(indices, _mm_set1_epi8(127)); - // All elements which indices are less than 48 are set to 0. + // All elements whose indices are less than 48 are set to 0. // Get shuffle results for indices in range [0, 15]. *ma = ShuffleIndex(c0, idx); // Get shuffle results for indices in range [16, 31]. @@ -1273,12 +1290,12 @@ inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], const __m128i res2 = ShuffleIndex(c2, idx); *ma = _mm_or_si128(*ma, res2); - // For elements which indices are larger than 47, since they seldom change + // For elements whose indices are larger than 47, since they seldom change // values with the increase of the index, we use comparison and arithmetic // operations to calculate their values. - // Add -128 to apply signed comparision instructions. + // Add -128 to apply signed comparison instructions. idx = _mm_add_epi8(indices, _mm_set1_epi8(-128)); - // Elements which indices are larger than 47 (with value 0) are set to 5. + // Elements whose indices are larger than 47 (with value 0) are set to 5. *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5)); *ma = AdjustValue(*ma, idx, 55); // 55 is the last index which value is 5. *ma = AdjustValue(*ma, idx, 72); // 72 is the last index which value is 4. @@ -1298,9 +1315,9 @@ inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). 
const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128()); - *b0 = CalculateB<9>(sum[0], maq0); + *b0 = CalculateB3(sum[0], maq0); const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128()); - *b1 = CalculateB<9>(sum[1], maq1); + *b1 = CalculateB3(sum[1], maq1); } inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], @@ -1776,9 +1793,9 @@ inline void BoxSumFilterPreProcess( const uint8_t* const src0, const uint8_t* const src1, const int width, const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - const ptrdiff_t sum_width, uint16_t* const ma343[4], - uint16_t* const ma444[2], uint16_t* ma565, uint32_t* const b343[4], - uint32_t* const b444[2], uint32_t* b565) { + const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444, + uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444, + uint32_t* b565) { __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3]; s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width); s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width); @@ -1808,9 +1825,8 @@ inline void BoxSumFilterPreProcess( Sum565W(b5 + 1, b + 2); StoreAligned64U32(b565, b); Prepare3_8<0>(ma3[1], ma3x); - Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444[0], b343[1], b444[0]); - Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444[0], b343[1], - b444[0]); + Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444); + Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444); Prepare3_8<0>(ma5, ma5x); ma[0] = Sum565Lo(ma5x); ma[1] = Sum565Hi(ma5x); @@ -1854,8 +1870,9 @@ inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma, return _mm_packs_epi32(dst_lo, dst_hi); // 13 bits } -inline __m128i CalculateFilteredOutputPass1(const __m128i src, __m128i ma[2], - __m128i b[2][2]) { +inline __m128i CalculateFilteredOutputPass1(const __m128i src, + const __m128i 
ma[2], + const __m128i b[2][2]) { const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]); __m128i b_sum[2]; b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]); @@ -1863,8 +1880,9 @@ inline __m128i CalculateFilteredOutputPass1(const __m128i src, __m128i ma[2], return CalculateFilteredOutput<5>(src, ma_sum, b_sum); } -inline __m128i CalculateFilteredOutputPass2(const __m128i src, __m128i ma[3], - __m128i b[3][2]) { +inline __m128i CalculateFilteredOutputPass2(const __m128i src, + const __m128i ma[3], + const __m128i b[3][2]) { const __m128i ma_sum = Sum3_16(ma); __m128i b_sum[2]; Sum3_32(b, b_sum); @@ -1916,15 +1934,15 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( int x = 0; do { - __m128i ma[2], ma3[3], b[2][2], sr[2], p[2]; + __m128i ma[2], ma5[3], b[2][2], sr[2], p[2]; s[0][1] = LoadUnaligned16Msan(src0 + x + 16, x + 16 + kOverreadInBytesPass1 - width); s[1][1] = LoadUnaligned16Msan(src1 + x + 16, x + 16 + kOverreadInBytesPass1 - width); BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas, bs); - Prepare3_8<0>(mas, ma3); - ma[1] = Sum565Lo(ma3); + Prepare3_8<0>(mas, ma5); + ma[1] = Sum565Lo(ma5); StoreAligned16(ma565[1] + x, ma[1]); Sum565W(bs, b[1]); StoreAligned32U32(b565[1] + x, b[1]); @@ -1939,7 +1957,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0); const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0); - ma[1] = Sum565Hi(ma3); + ma[1] = Sum565Hi(ma5); StoreAligned16(ma565[1] + x + 8, ma[1]); Sum565W(bs + 1, b[1]); StoreAligned32U32(b565[1] + x + 8, b[1]); @@ -2158,9 +2176,9 @@ inline void BoxFilterLastRow( const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0, const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5], uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - uint16_t* const ma343[4], uint16_t* const ma444[3], - uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3], - uint32_t* const b565[2], uint8_t* 
const dst) { + uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565, + uint32_t* const b343, uint32_t* const b444, uint32_t* const b565, + uint8_t* const dst) { __m128i s[2], ma3[2], ma5[2], sq[4], b3[3], b5[3], ma[3], b[3][2]; s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width); sq[0] = SquareLo8(s[0]); @@ -2183,13 +2201,13 @@ inline void BoxFilterLastRow( Sum343W(b3, b[2]); const __m128i sr = LoadAligned16(src + x); const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128()); - ma[0] = LoadAligned16(ma565[0] + x); - LoadAligned32U32(b565[0] + x, b[0]); + ma[0] = LoadAligned16(ma565 + x); + LoadAligned32U32(b565 + x, b[0]); p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b); - ma[0] = LoadAligned16(ma343[0] + x); - ma[1] = LoadAligned16(ma444[0] + x); - LoadAligned32U32(b343[0] + x, b[0]); - LoadAligned32U32(b444[0] + x, b[1]); + ma[0] = LoadAligned16(ma343 + x); + ma[1] = LoadAligned16(ma444 + x); + LoadAligned32U32(b343 + x, b[0]); + LoadAligned32U32(b444 + x, b[1]); p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b); const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2); @@ -2198,13 +2216,13 @@ inline void BoxFilterLastRow( ma[2] = Sum343Hi(ma3x); Sum343W(b3 + 1, b[2]); const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128()); - ma[0] = LoadAligned16(ma565[0] + x + 8); - LoadAligned32U32(b565[0] + x + 8, b[0]); + ma[0] = LoadAligned16(ma565 + x + 8); + LoadAligned32U32(b565 + x + 8, b[0]); p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b); - ma[0] = LoadAligned16(ma343[0] + x + 8); - ma[1] = LoadAligned16(ma444[0] + x + 8); - LoadAligned32U32(b343[0] + x + 8, b[0]); - LoadAligned32U32(b444[0] + x + 8, b[1]); + ma[0] = LoadAligned16(ma343 + x + 8); + ma[1] = LoadAligned16(ma444 + x + 8); + LoadAligned32U32(b343 + x + 8, b[0]); + LoadAligned32U32(b444 + x + 8, b[1]); p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b); const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2); StoreAligned16(dst + x, 
_mm_packus_epi16(d0, d1)); @@ -2220,8 +2238,9 @@ inline void BoxFilterLastRow( LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( const RestorationUnitInfo& restoration_info, const uint8_t* src, - const uint8_t* const top_border, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, const int height, + const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, SgrBuffer* const sgr_buffer, uint8_t* dst) { const auto temp_stride = Align<ptrdiff_t>(width, 16); const auto sum_width = Align<ptrdiff_t>(width + 8, 16); @@ -2261,14 +2280,14 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( b565[1] = b565[0] + temp_stride; assert(scales[0] != 0); assert(scales[1] != 0); - BoxSum(top_border, stride, width, sum_stride, sum_width, sum3[0], sum5[1], - square_sum3[0], square_sum5[1]); + BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0], + sum5[1], square_sum3[0], square_sum5[1]); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; const uint8_t* const s = (height > 1) ? 
src + stride : bottom_border; BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3, - square_sum5, sum_width, ma343, ma444, ma565[0], b343, - b444, b565[0]); + square_sum5, sum_width, ma343, ma444[0], ma565[0], + b343, b444[0], b565[0]); sum5[0] = sgr_buffer->sum5; square_sum5[0] = sgr_buffer->square_sum5; @@ -2298,7 +2317,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( const uint8_t* sr[2]; if ((height & 1) == 0) { sr[0] = bottom_border; - sr[1] = bottom_border + stride; + sr[1] = bottom_border + bottom_border_stride; } else { sr[0] = src + 2 * stride; sr[1] = bottom_border; @@ -2322,19 +2341,21 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( std::swap(ma565[0], ma565[1]); std::swap(b565[0], b565[1]); } - BoxFilterLastRow(src + 3, bottom_border + stride, width, sum_width, scales, - w0, w2, sum3, sum5, square_sum3, square_sum5, ma343, ma444, - ma565, b343, b444, b565, dst); + BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width, + sum_width, scales, w0, w2, sum3, sum5, square_sum3, + square_sum5, ma343[0], ma444[0], ma565[0], b343[0], + b444[0], b565[0], dst); } } inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, - const uint8_t* src, + const uint8_t* src, const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - uint8_t* dst) { + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { const auto temp_stride = Align<ptrdiff_t>(width, 16); const auto sum_width = Align<ptrdiff_t>(width + 8, 16); const auto sum_stride = temp_stride + 16; @@ -2354,8 +2375,8 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, b565[0] = sgr_buffer->b565; b565[1] = b565[0] + temp_stride; assert(scale != 0); - BoxSum<5>(top_border, stride, width, sum_stride, sum_width, 
sum5[1], - square_sum5[1]); + BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width, + sum5[1], square_sum5[1]); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; const uint8_t* const s = (height > 1) ? src + stride : bottom_border; @@ -2381,7 +2402,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, const uint8_t* sr[2]; if ((height & 1) == 0) { sr[0] = bottom_border; - sr[1] = bottom_border + stride; + sr[1] = bottom_border + bottom_border_stride; } else { sr[0] = src + 2 * stride; sr[1] = bottom_border; @@ -2399,18 +2420,20 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, Circulate5PointersBy2<uint16_t>(sum5); Circulate5PointersBy2<uint32_t>(square_sum5); } - BoxFilterPass1LastRow(src, bottom_border + stride, width, sum_width, scale, - w0, sum5, square_sum5, ma565[0], b565[0], dst); + BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width, + sum_width, scale, w0, sum5, square_sum5, ma565[0], + b565[0], dst); } } inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, - const uint8_t* src, + const uint8_t* src, const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - uint8_t* dst) { + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { assert(restoration_info.sgr_proj_info.multiplier[0] == 0); const auto temp_stride = Align<ptrdiff_t>(width, 16); const auto sum_width = Align<ptrdiff_t>(width + 8, 16); @@ -2436,8 +2459,8 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, b444[0] = sgr_buffer->b444; b444[1] = b444[0] + temp_stride; assert(scale != 0); - BoxSum<3>(top_border, stride, width, sum_stride, sum_width, sum3[0], - square_sum3[0]); + BoxSum<3>(top_border, top_border_stride, width, 
sum_stride, sum_width, + sum3[0], square_sum3[0]); BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, sum_width, ma343[0], nullptr, b343[0], nullptr); @@ -2448,7 +2471,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, s = src + stride; } else { s = bottom_border; - bottom_border += stride; + bottom_border += bottom_border_stride; } BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width, ma343[1], ma444[0], b343[1], b444[0]); @@ -2475,7 +2498,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, square_sum3, ma343, ma444, b343, b444, dst); src += stride; dst += stride; - bottom_border += stride; + bottom_border += bottom_border_stride; Circulate3PointersBy1<uint16_t>(ma343); Circulate3PointersBy1<uint32_t>(b343); std::swap(ma444[0], ma444[1]); @@ -2483,13 +2506,14 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, } while (--y != 0); } -// If |width| is non-multiple of 8, up to 7 more pixels are written to |dest| in -// the end of each row. It is safe to overwrite the output as it will not be +// If |width| is non-multiple of 16, up to 15 more pixels are written to |dest| +// in the end of each row. It is safe to overwrite the output as it will not be // part of the visible frame. 
void SelfGuidedFilter_SSE4_1( const RestorationUnitInfo& restoration_info, const void* const source, - const void* const top_border, const void* const bottom_border, - const ptrdiff_t stride, const int width, const int height, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, RestorationBuffer* const restoration_buffer, void* const dest) { const int index = restoration_info.sgr_proj_info.index; const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 @@ -2503,14 +2527,17 @@ void SelfGuidedFilter_SSE4_1( // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the // following assertion. assert(radius_pass_0 != 0); - BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3, - stride, width, height, sgr_buffer, dst); + BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, + width, height, sgr_buffer, dst); } else if (radius_pass_0 == 0) { - BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2, - stride, width, height, sgr_buffer, dst); + BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2, + top_border_stride, bottom - 2, bottom_border_stride, + width, height, sgr_buffer, dst); } else { - BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride, - width, height, sgr_buffer, dst); + BoxFilterProcess(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, width, + height, sgr_buffer, dst); } } @@ -2538,7 +2565,7 @@ void LoopRestorationInit_SSE4_1() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/loop_restoration_sse4.h b/src/dsp/x86/loop_restoration_sse4.h index 65b2b11..00df3af 100644 
--- a/src/dsp/x86/loop_restoration_sse4.h +++ b/src/dsp/x86/loop_restoration_sse4.h @@ -47,6 +47,10 @@ void LoopRestorationInit10bpp_SSE4_1(); #define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_SSE4_1 #endif +#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter +#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_SSE4_1 +#endif + #endif // LIBGAV1_TARGETING_SSE4_1 #endif // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_ diff --git a/src/dsp/x86/mask_blend_sse4.cc b/src/dsp/x86/mask_blend_sse4.cc index d8036be..2e836af 100644 --- a/src/dsp/x86/mask_blend_sse4.cc +++ b/src/dsp/x86/mask_blend_sse4.cc @@ -430,12 +430,515 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void MaskBlendInit_SSE4_1() { low_bitdepth::Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +constexpr int kMax10bppSample = (1 << 10) - 1; +constexpr int kMaskInverse = 64; +constexpr int kRoundBitsMaskBlend = 4; + +inline __m128i RightShiftWithRoundingZero_U16(const __m128i v_val_d, int bits, + const __m128i zero) { + // Shift out all but the last bit. + const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1); + // Avg with zero will shift by 1 and round. 
+ return _mm_avg_epu16(v_tmp_d, zero); +} + +inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits, + const __m128i shift) { + const __m128i v_tmp_d = _mm_add_epi32(v_val_d, shift); + return _mm_srai_epi32(v_tmp_d, bits); +} + +template <int subsampling_x, int subsampling_y> +inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride, + const __m128i zero) { + if (subsampling_x == 1) { + if (subsampling_y == 0) { + const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask)); + const __m128i mask_val_1 = + _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y))); + __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); + return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero); + } + const __m128i one = _mm_set1_epi8(1); + const __m128i mask_val_0 = + LoadHi8(LoadLo8(mask), mask + (mask_stride << 1)); + const __m128i mask_val_1 = LoadHi8(LoadLo8(mask + mask_stride), + mask + (mask_stride << 1) + mask_stride); + const __m128i add = _mm_adds_epu8(mask_val_0, mask_val_1); + const __m128i subsampled_mask = _mm_maddubs_epi16(add, one); + return RightShiftWithRoundingZero_U16(subsampled_mask, 2, zero); + } + assert(subsampling_y == 0 && subsampling_x == 0); + const __m128i mask_val_0 = Load4(mask); + const __m128i mask_val_1 = Load4(mask + mask_stride); + return _mm_cvtepu8_epi16( + _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4))); +} + +template <int subsampling_x, int subsampling_y> +inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride, + const __m128i zero) { + if (subsampling_x == 1) { + if (subsampling_y == 0) { + const __m128i row_vals = LoadUnaligned16(mask); + const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals); + const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8)); + __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); + return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero); + } + const __m128i one = _mm_set1_epi8(1); + const 
__m128i mask_val_0 = LoadUnaligned16(mask); + const __m128i mask_val_1 = LoadUnaligned16(mask + stride); + const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1); + const __m128i mask_0 = _mm_maddubs_epi16(add_0, one); + return RightShiftWithRoundingZero_U16(mask_0, 2, zero); + } + assert(subsampling_y == 0 && subsampling_x == 0); + const __m128i mask_val = LoadLo8(mask); + return _mm_cvtepu8_epi16(mask_val); +} + +inline void WriteMaskBlendLine10bpp4x2_SSE4_1( + const uint16_t* pred_0, const uint16_t* pred_1, + const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0, + const __m128i& pred_mask_1, const __m128i& offset, const __m128i& max, + const __m128i& shift4, uint16_t* dst, const ptrdiff_t dst_stride) { + const __m128i pred_val_0 = LoadUnaligned16(pred_0); + const __m128i pred_val_1 = LoadHi8(LoadLo8(pred_1), pred_1 + pred_stride_1); + + // int res = (mask_value * pred_0[x] + (64 - mask_value) * pred_1[x]) >> 6; + const __m128i compound_pred_lo_0 = _mm_mullo_epi16(pred_val_0, pred_mask_0); + const __m128i compound_pred_hi_0 = _mm_mulhi_epu16(pred_val_0, pred_mask_0); + const __m128i compound_pred_lo_1 = _mm_mullo_epi16(pred_val_1, pred_mask_1); + const __m128i compound_pred_hi_1 = _mm_mulhi_epu16(pred_val_1, pred_mask_1); + const __m128i pack0_lo = + _mm_unpacklo_epi16(compound_pred_lo_0, compound_pred_hi_0); + const __m128i pack0_hi = + _mm_unpackhi_epi16(compound_pred_lo_0, compound_pred_hi_0); + const __m128i pack1_lo = + _mm_unpacklo_epi16(compound_pred_lo_1, compound_pred_hi_1); + const __m128i pack1_hi = + _mm_unpackhi_epi16(compound_pred_lo_1, compound_pred_hi_1); + const __m128i compound_pred_lo = _mm_add_epi32(pack0_lo, pack1_lo); + const __m128i compound_pred_hi = _mm_add_epi32(pack0_hi, pack1_hi); + // res -= (bitdepth == 8) ? 
0 : kCompoundOffset; + const __m128i sub_0 = + _mm_sub_epi32(_mm_srli_epi32(compound_pred_lo, 6), offset); + const __m128i sub_1 = + _mm_sub_epi32(_mm_srli_epi32(compound_pred_hi, 6), offset); + + // dst[x] = static_cast<Pixel>( + // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0, + // (1 << kBitdepth8) - 1)); + const __m128i shift_0 = + RightShiftWithRoundingConst_S32(sub_0, kRoundBitsMaskBlend, shift4); + const __m128i shift_1 = + RightShiftWithRoundingConst_S32(sub_1, kRoundBitsMaskBlend, shift4); + const __m128i result = _mm_min_epi16(_mm_packus_epi32(shift_0, shift_1), max); + StoreLo8(dst, result); + StoreHi8(dst + dst_stride, result); +} + +template <int subsampling_x, int subsampling_y> +inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* pred_0, + const uint16_t* pred_1, + const ptrdiff_t pred_stride_1, + const uint8_t* mask, + const ptrdiff_t mask_stride, uint16_t* dst, + const ptrdiff_t dst_stride) { + const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); + const __m128i zero = _mm_setzero_si128(); + const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1); + const __m128i offset = _mm_set1_epi32(kCompoundOffset); + const __m128i max = _mm_set1_epi16(kMax10bppSample); + __m128i pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, + pred_mask_1, offset, max, shift4, dst, + dst_stride); + pred_0 += 4 << 1; + pred_1 += pred_stride_1 << 1; + mask += mask_stride << (1 + subsampling_y); + dst += dst_stride << 1; + + pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, + pred_mask_1, offset, max, shift4, dst, + dst_stride); +} + +template <int subsampling_x, int subsampling_y> +inline 
void MaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0, + const uint16_t* pred_1, + const ptrdiff_t pred_stride_1, + const uint8_t* const mask_ptr, + const ptrdiff_t mask_stride, + const int height, uint16_t* dst, + const ptrdiff_t dst_stride) { + const uint8_t* mask = mask_ptr; + if (height == 4) { + MaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>( + pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride); + return; + } + const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); + const __m128i zero = _mm_setzero_si128(); + const uint8_t pred0_stride2 = 4 << 1; + const ptrdiff_t pred1_stride2 = pred_stride_1 << 1; + const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y); + const ptrdiff_t dst_stride2 = dst_stride << 1; + const __m128i offset = _mm_set1_epi32(kCompoundOffset); + const __m128i max = _mm_set1_epi16(kMax10bppSample); + const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1); + int y = height; + do { + __m128i pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + + WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, offset, max, + shift4, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + + pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, offset, max, + shift4, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + + pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, offset, 
max, + shift4, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + + pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, offset, max, + shift4, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + y -= 8; + } while (y != 0); +} + +template <int subsampling_x, int subsampling_y> +inline void MaskBlend10bpp_SSE4_1(const void* prediction_0, + const void* prediction_1, + const ptrdiff_t prediction_stride_1, + const uint8_t* const mask_ptr, + const ptrdiff_t mask_stride, const int width, + const int height, void* dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]); + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + const ptrdiff_t pred_stride_0 = width; + const ptrdiff_t pred_stride_1 = prediction_stride_1; + if (width == 4) { + MaskBlend10bpp4xH_SSE4_1<subsampling_x, subsampling_y>( + pred_0, pred_1, pred_stride_1, mask_ptr, mask_stride, height, dst, + dst_stride); + return; + } + const uint8_t* mask = mask_ptr; + const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); + const __m128i zero = _mm_setzero_si128(); + const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y; + const __m128i offset = _mm_set1_epi32(kCompoundOffset); + const __m128i max = _mm_set1_epi16(kMax10bppSample); + const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1); + int y = height; + do { + int x = 0; + do { + const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>( + mask + (x << subsampling_x), mask_stride, zero); + const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x); 
+ const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x); + // 64 - mask + const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + + const __m128i compound_pred_lo_0 = + _mm_mullo_epi16(pred_val_0, pred_mask_0); + const __m128i compound_pred_hi_0 = + _mm_mulhi_epu16(pred_val_0, pred_mask_0); + const __m128i compound_pred_lo_1 = + _mm_mullo_epi16(pred_val_1, pred_mask_1); + const __m128i compound_pred_hi_1 = + _mm_mulhi_epu16(pred_val_1, pred_mask_1); + const __m128i pack0_lo = + _mm_unpacklo_epi16(compound_pred_lo_0, compound_pred_hi_0); + const __m128i pack0_hi = + _mm_unpackhi_epi16(compound_pred_lo_0, compound_pred_hi_0); + const __m128i pack1_lo = + _mm_unpacklo_epi16(compound_pred_lo_1, compound_pred_hi_1); + const __m128i pack1_hi = + _mm_unpackhi_epi16(compound_pred_lo_1, compound_pred_hi_1); + const __m128i compound_pred_lo = _mm_add_epi32(pack0_lo, pack1_lo); + const __m128i compound_pred_hi = _mm_add_epi32(pack0_hi, pack1_hi); + + const __m128i sub_0 = + _mm_sub_epi32(_mm_srli_epi32(compound_pred_lo, 6), offset); + const __m128i sub_1 = + _mm_sub_epi32(_mm_srli_epi32(compound_pred_hi, 6), offset); + const __m128i shift_0 = + RightShiftWithRoundingConst_S32(sub_0, kRoundBitsMaskBlend, shift4); + const __m128i shift_1 = + RightShiftWithRoundingConst_S32(sub_1, kRoundBitsMaskBlend, shift4); + const __m128i result = + _mm_min_epi16(_mm_packus_epi32(shift_0, shift_1), max); + StoreUnaligned16(dst + x, result); + x += 8; + } while (x < width); + dst += dst_stride; + pred_0 += pred_stride_0; + pred_1 += pred_stride_1; + mask += mask_stride_ss; + } while (--y != 0); +} + +inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1( + const uint16_t* prediction_0, const uint16_t* prediction_1, + const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0, + const __m128i& pred_mask_1, const __m128i& shift6, uint16_t* dst, + const ptrdiff_t dst_stride) { + const __m128i pred_val_0 = LoadUnaligned16(prediction_0); + const __m128i pred_val_1 = + 
LoadHi8(LoadLo8(prediction_1), prediction_1 + pred_stride_1); + + const __m128i mask_0 = _mm_unpacklo_epi16(pred_mask_1, pred_mask_0); + const __m128i mask_1 = _mm_unpackhi_epi16(pred_mask_1, pred_mask_0); + const __m128i pred_0 = _mm_unpacklo_epi16(pred_val_0, pred_val_1); + const __m128i pred_1 = _mm_unpackhi_epi16(pred_val_0, pred_val_1); + + const __m128i compound_pred_0 = _mm_madd_epi16(pred_0, mask_0); + const __m128i compound_pred_1 = _mm_madd_epi16(pred_1, mask_1); + const __m128i shift_0 = + RightShiftWithRoundingConst_S32(compound_pred_0, 6, shift6); + const __m128i shift_1 = + RightShiftWithRoundingConst_S32(compound_pred_1, 6, shift6); + const __m128i res = _mm_packus_epi32(shift_0, shift_1); + StoreLo8(dst, res); + StoreHi8(dst + dst_stride, res); +} + +template <int subsampling_x, int subsampling_y> +inline void InterIntraMaskBlend10bpp4x4_SSE4_1( + const uint16_t* pred_0, const uint16_t* pred_1, + const ptrdiff_t pred_stride_1, const uint8_t* mask, + const ptrdiff_t mask_stride, uint16_t* dst, const ptrdiff_t dst_stride) { + const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); + const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); + const __m128i zero = _mm_setzero_si128(); + __m128i pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, shift6, + dst, dst_stride); + pred_0 += 4 << 1; + pred_1 += pred_stride_1 << 1; + mask += mask_stride << (1 + subsampling_y); + dst += dst_stride << 1; + + pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, shift6, + dst, dst_stride); +} + +template <int subsampling_x, int subsampling_y> +inline void 
InterIntraMaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0, + const uint16_t* pred_1, + const ptrdiff_t pred_stride_1, + const uint8_t* const mask_ptr, + const ptrdiff_t mask_stride, + const int height, uint16_t* dst, + const ptrdiff_t dst_stride) { + const uint8_t* mask = mask_ptr; + if (height == 4) { + InterIntraMaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>( + pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride); + return; + } + const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); + const __m128i zero = _mm_setzero_si128(); + const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); + const uint8_t pred0_stride2 = 4 << 1; + const ptrdiff_t pred1_stride2 = pred_stride_1 << 1; + const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y); + const ptrdiff_t dst_stride2 = dst_stride << 1; + int y = height; + do { + __m128i pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, + shift6, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + + pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, + shift6, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + + pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, + shift6, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += 
dst_stride2; + + pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, + shift6, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + y -= 8; + } while (y != 0); +} + +template <int subsampling_x, int subsampling_y> +inline void InterIntraMaskBlend10bpp_SSE4_1( + const void* prediction_0, const void* prediction_1, + const ptrdiff_t prediction_stride_1, const uint8_t* const mask_ptr, + const ptrdiff_t mask_stride, const int width, const int height, void* dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]); + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + const ptrdiff_t pred_stride_0 = width; + const ptrdiff_t pred_stride_1 = prediction_stride_1; + if (width == 4) { + InterIntraMaskBlend10bpp4xH_SSE4_1<subsampling_x, subsampling_y>( + pred_0, pred_1, pred_stride_1, mask_ptr, mask_stride, height, dst, + dst_stride); + return; + } + const uint8_t* mask = mask_ptr; + const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); + const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); + const __m128i zero = _mm_setzero_si128(); + const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y; + int y = height; + do { + int x = 0; + do { + const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>( + mask + (x << subsampling_x), mask_stride, zero); + const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x); + const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x); + // 64 - mask + const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + const __m128i mask_0 = _mm_unpacklo_epi16(pred_mask_1, pred_mask_0); + const __m128i 
mask_1 = _mm_unpackhi_epi16(pred_mask_1, pred_mask_0); + const __m128i pred_0 = _mm_unpacklo_epi16(pred_val_0, pred_val_1); + const __m128i pred_1 = _mm_unpackhi_epi16(pred_val_0, pred_val_1); + + const __m128i compound_pred_0 = _mm_madd_epi16(pred_0, mask_0); + const __m128i compound_pred_1 = _mm_madd_epi16(pred_1, mask_1); + const __m128i shift_0 = + RightShiftWithRoundingConst_S32(compound_pred_0, 6, shift6); + const __m128i shift_1 = + RightShiftWithRoundingConst_S32(compound_pred_1, 6, shift6); + StoreUnaligned16(dst + x, _mm_packus_epi32(shift_0, shift_1)); + x += 8; + } while (x < width); + dst += dst_stride; + pred_0 += pred_stride_0; + pred_1 += pred_stride_1; + mask += mask_stride_ss; + } while (--y != 0); +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + +#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend444) + dsp->mask_blend[0][0] = MaskBlend10bpp_SSE4_1<0, 0>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend422) + dsp->mask_blend[1][0] = MaskBlend10bpp_SSE4_1<1, 0>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend420) + dsp->mask_blend[2][0] = MaskBlend10bpp_SSE4_1<1, 1>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra444) + dsp->mask_blend[0][1] = InterIntraMaskBlend10bpp_SSE4_1<0, 0>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra422) + dsp->mask_blend[1][1] = InterIntraMaskBlend10bpp_SSE4_1<1, 0>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra420) + dsp->mask_blend[2][1] = InterIntraMaskBlend10bpp_SSE4_1<1, 1>; +#endif +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void MaskBlendInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/mask_blend_sse4.h 
b/src/dsp/x86/mask_blend_sse4.h index 52b0b5c..4a95f0c 100644 --- a/src/dsp/x86/mask_blend_sse4.h +++ b/src/dsp/x86/mask_blend_sse4.h @@ -55,6 +55,30 @@ void MaskBlendInit_SSE4_1(); #define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 LIBGAV1_CPU_SSE4_1 #endif +#ifndef LIBGAV1_Dsp10bpp_MaskBlend444 +#define LIBGAV1_Dsp10bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_MaskBlend422 +#define LIBGAV1_Dsp10bpp_MaskBlend422 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_MaskBlend420 +#define LIBGAV1_Dsp10bpp_MaskBlend420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra444 +#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra422 +#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra422 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra420 +#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra420 LIBGAV1_CPU_SSE4_1 +#endif + #endif // LIBGAV1_TARGETING_SSE4_1 #endif // LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_ diff --git a/src/dsp/x86/motion_field_projection_sse4.cc b/src/dsp/x86/motion_field_projection_sse4.cc index c506941..e3f2cce 100644 --- a/src/dsp/x86/motion_field_projection_sse4.cc +++ b/src/dsp/x86/motion_field_projection_sse4.cc @@ -139,9 +139,9 @@ inline void Store(const __m128i position, const __m128i reference_offset, const ptrdiff_t offset = static_cast<int16_t>(_mm_extract_epi16(position, idx)); if ((idx & 3) == 0) { - dst_mv[offset].mv32 = _mm_cvtsi128_si32(mv); + dst_mv[offset].mv32 = static_cast<uint32_t>(_mm_cvtsi128_si32(mv)); } else { - dst_mv[offset].mv32 = _mm_extract_epi32(mv, idx & 3); + dst_mv[offset].mv32 = static_cast<uint32_t>(_mm_extract_epi32(mv, idx & 3)); } dst_reference_offset[offset] = _mm_extract_epi8(reference_offset, idx); } @@ -386,7 +386,7 @@ void MotionFieldProjectionInit_SSE4_1() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace 
libgav1 { namespace dsp { diff --git a/src/dsp/x86/motion_vector_search_sse4.cc b/src/dsp/x86/motion_vector_search_sse4.cc index e9cdd4c..7f5f035 100644 --- a/src/dsp/x86/motion_vector_search_sse4.cc +++ b/src/dsp/x86/motion_vector_search_sse4.cc @@ -251,7 +251,7 @@ void MotionVectorSearchInit_SSE4_1() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc index 3a1d1fd..c34a7f7 100644 --- a/src/dsp/x86/obmc_sse4.cc +++ b/src/dsp/x86/obmc_sse4.cc @@ -31,6 +31,7 @@ namespace libgav1 { namespace dsp { +namespace low_bitdepth { namespace { #include "src/dsp/obmc.inc" @@ -311,13 +312,295 @@ void Init8bpp() { } } // namespace +} // namespace low_bitdepth -void ObmcInit_SSE4_1() { Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +#include "src/dsp/obmc.inc" + +constexpr int kRoundBitsObmcBlend = 6; + +inline void OverlapBlendFromLeft2xH_SSE4_1( + uint16_t* const prediction, const ptrdiff_t pred_stride, const int height, + const uint16_t* const obmc_prediction, const ptrdiff_t obmc_pred_stride) { + uint16_t* pred = prediction; + const uint16_t* obmc_pred = obmc_prediction; + const ptrdiff_t pred_stride2 = pred_stride << 1; + const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1; + const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040); + const __m128i mask_val = _mm_shufflelo_epi16(Load2(kObmcMask), 0x00); + // 64 - mask. 
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks = + _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val)); + int y = height; + do { + const __m128i pred_val = Load4x2(pred, pred + pred_stride); + const __m128i obmc_pred_val = + Load4x2(obmc_pred, obmc_pred + obmc_pred_stride); + const __m128i terms = _mm_unpacklo_epi16(pred_val, obmc_pred_val); + const __m128i result = RightShiftWithRounding_U32( + _mm_madd_epi16(terms, masks), kRoundBitsObmcBlend); + const __m128i packed_result = _mm_packus_epi32(result, result); + Store4(pred, packed_result); + Store4(pred + pred_stride, _mm_srli_si128(packed_result, 4)); + pred += pred_stride2; + obmc_pred += obmc_pred_stride2; + y -= 2; + } while (y != 0); +} + +inline void OverlapBlendFromLeft4xH_SSE4_1( + uint16_t* const prediction, const ptrdiff_t pred_stride, const int height, + const uint16_t* const obmc_prediction, const ptrdiff_t obmc_pred_stride) { + uint16_t* pred = prediction; + const uint16_t* obmc_pred = obmc_prediction; + const ptrdiff_t pred_stride2 = pred_stride << 1; + const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1; + const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040); + const __m128i mask_val = Load4(kObmcMask + 2); + // 64 - mask. 
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks = + _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val)); + int y = height; + do { + const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride); + const __m128i obmc_pred_val = + LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride); + const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val); + const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val); + const __m128i result_lo = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_lo, masks), kRoundBitsObmcBlend); + const __m128i result_hi = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_hi, masks), kRoundBitsObmcBlend); + const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi); + StoreLo8(pred, packed_result); + StoreHi8(pred + pred_stride, packed_result); + pred += pred_stride2; + obmc_pred += obmc_pred_stride2; + y -= 2; + } while (y != 0); +} + +void OverlapBlendFromLeft10bpp_SSE4_1(void* const prediction, + const ptrdiff_t prediction_stride, + const int width, const int height, + const void* const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + auto* pred = static_cast<uint16_t*>(prediction); + const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction); + const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]); + const ptrdiff_t obmc_pred_stride = + obmc_prediction_stride / sizeof(obmc_pred[0]); + + if (width == 2) { + OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred, + obmc_pred_stride); + return; + } + if (width == 4) { + OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred, + obmc_pred_stride); + return; + } + const __m128i mask_inverter = _mm_set1_epi8(64); + const uint8_t* mask = kObmcMask + width - 2; + int x = 0; + do { + pred = static_cast<uint16_t*>(prediction) + x; + obmc_pred = static_cast<const uint16_t*>(obmc_prediction) + x; + const __m128i mask_val = LoadLo8(mask + x); + // 
64 - mask + const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); + const __m128i masks_lo = _mm_cvtepi8_epi16(masks); + const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8)); + int y = height; + do { + const __m128i pred_val = LoadUnaligned16(pred); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); + const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val); + const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val); + const __m128i result_lo = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend); + const __m128i result_hi = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend); + StoreUnaligned16(pred, _mm_packus_epi32(result_lo, result_hi)); + + pred += pred_stride; + obmc_pred += obmc_pred_stride; + } while (--y != 0); + x += 8; + } while (x < width); +} + +inline void OverlapBlendFromTop2xH_SSE4_1(uint16_t* const prediction, + const ptrdiff_t pred_stride, + const int height, + const uint16_t* const obmc_prediction, + const ptrdiff_t obmc_pred_stride) { + uint16_t* pred = prediction; + const uint16_t* obmc_pred = obmc_prediction; + const __m128i mask_inverter = _mm_set1_epi16(64); + const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0); + const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1); + const uint8_t* mask = kObmcMask + height - 2; + const int compute_height = + height - (height >> 2); // compute_height based on 8-bit opt + const ptrdiff_t pred_stride2 = pred_stride << 1; + const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1; + int y = 0; + do { + // First mask in the first half, second mask in the second half. 
+ const __m128i mask_val = _mm_shuffle_epi8(Load4(mask + y), mask_shuffler); + const __m128i masks = + _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter)); + const __m128i masks_lo = _mm_cvtepi8_epi16(masks); + const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8)); + + const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride); + const __m128i obmc_pred_val = + LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride); + const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val); + const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val); + const __m128i result_lo = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend); + const __m128i result_hi = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend); + const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi); + + Store4(pred, packed_result); + Store4(pred + pred_stride, _mm_srli_si128(packed_result, 8)); + pred += pred_stride2; + obmc_pred += obmc_pred_stride2; + y += 2; + } while (y < compute_height); +} + +inline void OverlapBlendFromTop4xH_SSE4_1(uint16_t* const prediction, + const ptrdiff_t pred_stride, + const int height, + const uint16_t* const obmc_prediction, + const ptrdiff_t obmc_pred_stride) { + uint16_t* pred = prediction; + const uint16_t* obmc_pred = obmc_prediction; + const __m128i mask_inverter = _mm_set1_epi16(64); + const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0); + const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1); + const uint8_t* mask = kObmcMask + height - 2; + const int compute_height = height - (height >> 2); + const ptrdiff_t pred_stride2 = pred_stride << 1; + const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1; + int y = 0; + do { + // First mask in the first half, second mask in the second half. 
+ const __m128i mask_val = _mm_shuffle_epi8(Load4(mask + y), mask_shuffler); + const __m128i masks = + _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter)); + const __m128i masks_lo = _mm_cvtepi8_epi16(masks); + const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8)); + + const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride); + const __m128i obmc_pred_val = + LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride); + const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val); + const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val); + const __m128i result_lo = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend); + const __m128i result_hi = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend); + const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi); + + StoreLo8(pred, packed_result); + StoreHi8(pred + pred_stride, packed_result); + pred += pred_stride2; + obmc_pred += obmc_pred_stride2; + y += 2; + } while (y < compute_height); +} + +void OverlapBlendFromTop10bpp_SSE4_1(void* const prediction, + const ptrdiff_t prediction_stride, + const int width, const int height, + const void* const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + auto* pred = static_cast<uint16_t*>(prediction); + const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction); + const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]); + const ptrdiff_t obmc_pred_stride = + obmc_prediction_stride / sizeof(obmc_pred[0]); + + if (width == 2) { + OverlapBlendFromTop2xH_SSE4_1(pred, pred_stride, height, obmc_pred, + obmc_pred_stride); + return; + } + if (width == 4) { + OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred, + obmc_pred_stride); + return; + } + + const __m128i mask_inverter = _mm_set1_epi8(64); + const int compute_height = height - (height >> 2); + const uint8_t* mask = kObmcMask + height - 
2; + pred = static_cast<uint16_t*>(prediction); + obmc_pred = static_cast<const uint16_t*>(obmc_prediction); + int y = 0; + do { + const __m128i mask_val = _mm_set1_epi8(mask[y]); + // 64 - mask + const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); + const __m128i masks_lo = _mm_cvtepi8_epi16(masks); + const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8)); + int x = 0; + do { + const __m128i pred_val = LoadUnaligned16(pred + x); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x); + const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val); + const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val); + const __m128i result_lo = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend); + const __m128i result_hi = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend); + StoreUnaligned16(pred + x, _mm_packus_epi32(result_lo, result_hi)); + x += 8; + } while (x < width); + pred += pred_stride; + obmc_pred += obmc_pred_stride; + } while (++y < compute_height); +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); +#if DSP_ENABLED_10BPP_SSE4_1(ObmcVertical) + dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop10bpp_SSE4_1; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(ObmcHorizontal) + dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft10bpp_SSE4_1; +#endif +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void ObmcInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/obmc_sse4.h 
b/src/dsp/x86/obmc_sse4.h index bd8b416..448d2cf 100644 --- a/src/dsp/x86/obmc_sse4.h +++ b/src/dsp/x86/obmc_sse4.h @@ -38,6 +38,12 @@ void ObmcInit_SSE4_1(); #ifndef LIBGAV1_Dsp8bpp_ObmcHorizontal #define LIBGAV1_Dsp8bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1 #endif +#ifndef LIBGAV1_Dsp10bpp_ObmcVertical +#define LIBGAV1_Dsp10bpp_ObmcVertical LIBGAV1_CPU_SSE4_1 +#endif +#ifndef LIBGAV1_Dsp10bpp_ObmcHorizontal +#define LIBGAV1_Dsp10bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1 +#endif #endif // LIBGAV1_TARGETING_SSE4_1 #endif // LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_ diff --git a/src/dsp/x86/super_res_sse4.cc b/src/dsp/x86/super_res_sse4.cc index b2bdfd2..85d05bc 100644 --- a/src/dsp/x86/super_res_sse4.cc +++ b/src/dsp/x86/super_res_sse4.cc @@ -91,10 +91,10 @@ void SuperResCoefficients_SSE4_1(const int upscaled_width, } void SuperRes_SSE4_1(const void* const coefficients, void* const source, - const ptrdiff_t stride, const int height, + const ptrdiff_t source_stride, const int height, const int downscaled_width, const int upscaled_width, const int initial_subpixel_x, const int step, - void* const dest) { + void* const dest, const ptrdiff_t dest_stride) { auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps); auto* dst = static_cast<uint8_t*>(dest); int y = height; @@ -104,16 +104,30 @@ void SuperRes_SSE4_1(const void* const coefficients, void* const source, ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width, kSuperResHorizontalBorder, kSuperResHorizontalBorder); int subpixel_x = initial_subpixel_x; - // The below code calculates up to 15 extra upscaled - // pixels which will over-read up to 15 downscaled pixels in the end of each - // row. kSuperResHorizontalBorder accounts for this. + // The below code calculates up to 15 extra upscaled pixels which will + // over-read up to 15 downscaled pixels in the end of each row. + // kSuperResHorizontalPadding protects this behavior from segmentation + // faults and threading issues. 
int x = RightShiftWithCeiling(upscaled_width, 4); do { __m128i weighted_src[8]; for (int i = 0; i < 8; ++i, filter += 16) { - __m128i s = LoadLo8(&src[subpixel_x >> kSuperResScaleBits]); + // TODO(b/178652672): Remove Msan loads when hadd bug is resolved. + // It's fine to write uninitialized bytes outside the frame, but the + // inside-frame pixels are incorrectly labeled uninitialized if + // uninitialized values go through the hadd intrinsics. + // |src| is offset 4 pixels to the left, and there are 4 extended border + // pixels, so a difference of 0 from |downscaled_width| indicates 8 good + // bytes. A difference of 1 indicates 7 good bytes. + const int msan_bytes_lo = + (subpixel_x >> kSuperResScaleBits) - downscaled_width; + __m128i s = + LoadLo8Msan(&src[subpixel_x >> kSuperResScaleBits], msan_bytes_lo); subpixel_x += step; - s = LoadHi8(s, &src[subpixel_x >> kSuperResScaleBits]); + const int msan_bytes_hi = + (subpixel_x >> kSuperResScaleBits) - downscaled_width; + s = LoadHi8Msan(s, &src[subpixel_x >> kSuperResScaleBits], + msan_bytes_hi); subpixel_x += step; const __m128i f = LoadAligned16(filter); weighted_src[i] = _mm_maddubs_epi16(s, f); @@ -135,26 +149,165 @@ void SuperRes_SSE4_1(const void* const coefficients, void* const source, StoreAligned16(dst_ptr, _mm_packus_epi16(a[0], a[1])); dst_ptr += 16; } while (--x != 0); - src += stride; - dst += stride; + src += source_stride; + dst += dest_stride; } while (--y != 0); } void Init8bpp() { Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8); +#if DSP_ENABLED_8BPP_SSE4_1(SuperResCoefficients) dsp->super_res_coefficients = SuperResCoefficients_SSE4_1; +#endif // DSP_ENABLED_8BPP_SSE4_1(SuperResCoefficients) +#if DSP_ENABLED_8BPP_SSE4_1(SuperRes) dsp->super_res = SuperRes_SSE4_1; +#endif // DSP_ENABLED_8BPP_SSE4_1(SuperRes) } } // namespace } // namespace low_bitdepth -void SuperResInit_SSE4_1() { low_bitdepth::Init8bpp(); } 
+//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +// Upscale_Filter as defined in AV1 Section 7.16 +alignas(16) const int16_t + kUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = { + {0, 0, 0, 128, 0, 0, 0, 0}, {0, 0, -1, 128, 2, -1, 0, 0}, + {0, 1, -3, 127, 4, -2, 1, 0}, {0, 1, -4, 127, 6, -3, 1, 0}, + {0, 2, -6, 126, 8, -3, 1, 0}, {0, 2, -7, 125, 11, -4, 1, 0}, + {-1, 2, -8, 125, 13, -5, 2, 0}, {-1, 3, -9, 124, 15, -6, 2, 0}, + {-1, 3, -10, 123, 18, -6, 2, -1}, {-1, 3, -11, 122, 20, -7, 3, -1}, + {-1, 4, -12, 121, 22, -8, 3, -1}, {-1, 4, -13, 120, 25, -9, 3, -1}, + {-1, 4, -14, 118, 28, -9, 3, -1}, {-1, 4, -15, 117, 30, -10, 4, -1}, + {-1, 5, -16, 116, 32, -11, 4, -1}, {-1, 5, -16, 114, 35, -12, 4, -1}, + {-1, 5, -17, 112, 38, -12, 4, -1}, {-1, 5, -18, 111, 40, -13, 5, -1}, + {-1, 5, -18, 109, 43, -14, 5, -1}, {-1, 6, -19, 107, 45, -14, 5, -1}, + {-1, 6, -19, 105, 48, -15, 5, -1}, {-1, 6, -19, 103, 51, -16, 5, -1}, + {-1, 6, -20, 101, 53, -16, 6, -1}, {-1, 6, -20, 99, 56, -17, 6, -1}, + {-1, 6, -20, 97, 58, -17, 6, -1}, {-1, 6, -20, 95, 61, -18, 6, -1}, + {-2, 7, -20, 93, 64, -18, 6, -2}, {-2, 7, -20, 91, 66, -19, 6, -1}, + {-2, 7, -20, 88, 69, -19, 6, -1}, {-2, 7, -20, 86, 71, -19, 6, -1}, + {-2, 7, -20, 84, 74, -20, 7, -2}, {-2, 7, -20, 81, 76, -20, 7, -1}, + {-2, 7, -20, 79, 79, -20, 7, -2}, {-1, 7, -20, 76, 81, -20, 7, -2}, + {-2, 7, -20, 74, 84, -20, 7, -2}, {-1, 6, -19, 71, 86, -20, 7, -2}, + {-1, 6, -19, 69, 88, -20, 7, -2}, {-1, 6, -19, 66, 91, -20, 7, -2}, + {-2, 6, -18, 64, 93, -20, 7, -2}, {-1, 6, -18, 61, 95, -20, 6, -1}, + {-1, 6, -17, 58, 97, -20, 6, -1}, {-1, 6, -17, 56, 99, -20, 6, -1}, + {-1, 6, -16, 53, 101, -20, 6, -1}, {-1, 5, -16, 51, 103, -19, 6, -1}, + {-1, 5, -15, 48, 105, -19, 6, -1}, {-1, 5, -14, 45, 107, -19, 6, -1}, + {-1, 5, -14, 43, 109, -18, 5, -1}, {-1, 5, -13, 40, 111, -18, 5, -1}, + {-1, 4, -12, 38, 112, -17, 5, 
-1}, {-1, 4, -12, 35, 114, -16, 5, -1}, + {-1, 4, -11, 32, 116, -16, 5, -1}, {-1, 4, -10, 30, 117, -15, 4, -1}, + {-1, 3, -9, 28, 118, -14, 4, -1}, {-1, 3, -9, 25, 120, -13, 4, -1}, + {-1, 3, -8, 22, 121, -12, 4, -1}, {-1, 3, -7, 20, 122, -11, 3, -1}, + {-1, 2, -6, 18, 123, -10, 3, -1}, {0, 2, -6, 15, 124, -9, 3, -1}, + {0, 2, -5, 13, 125, -8, 2, -1}, {0, 1, -4, 11, 125, -7, 2, 0}, + {0, 1, -3, 8, 126, -6, 2, 0}, {0, 1, -3, 6, 127, -4, 1, 0}, + {0, 1, -2, 4, 127, -3, 1, 0}, {0, 0, -1, 2, 128, -1, 0, 0}, +}; + +void SuperResCoefficients_SSE4_1(const int upscaled_width, + const int initial_subpixel_x, const int step, + void* const coefficients) { + auto* dst = static_cast<uint16_t*>(coefficients); + int subpixel_x = initial_subpixel_x; + int x = RightShiftWithCeiling(upscaled_width, 3); + do { + for (int i = 0; i < 8; ++i, dst += 8) { + int remainder = subpixel_x & kSuperResScaleMask; + __m128i filter = + LoadAligned16(kUpscaleFilter[remainder >> kSuperResExtraBits]); + subpixel_x += step; + StoreAligned16(dst, filter); + } + } while (--x != 0); +} + +template <int bitdepth> +void SuperRes_SSE4_1(const void* const coefficients, void* const source, + const ptrdiff_t source_stride, const int height, + const int downscaled_width, const int upscaled_width, + const int initial_subpixel_x, const int step, + void* const dest, const ptrdiff_t dest_stride) { + auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps); + auto* dst = static_cast<uint16_t*>(dest); + int y = height; + do { + const auto* filter = static_cast<const uint16_t*>(coefficients); + uint16_t* dst_ptr = dst; + ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width, + kSuperResHorizontalBorder, kSuperResHorizontalPadding); + int subpixel_x = initial_subpixel_x; + // The below code calculates up to 7 extra upscaled + // pixels which will over-read up to 7 downscaled pixels in the end of each + // row. kSuperResHorizontalPadding accounts for this. 
+ int x = RightShiftWithCeiling(upscaled_width, 3); + do { + __m128i weighted_src[8]; + for (int i = 0; i < 8; ++i, filter += 8) { + const __m128i s = + LoadUnaligned16(&src[subpixel_x >> kSuperResScaleBits]); + subpixel_x += step; + const __m128i f = LoadAligned16(filter); + weighted_src[i] = _mm_madd_epi16(s, f); + } + + __m128i a[4]; + a[0] = _mm_hadd_epi32(weighted_src[0], weighted_src[1]); + a[1] = _mm_hadd_epi32(weighted_src[2], weighted_src[3]); + a[2] = _mm_hadd_epi32(weighted_src[4], weighted_src[5]); + a[3] = _mm_hadd_epi32(weighted_src[6], weighted_src[7]); + + a[0] = _mm_hadd_epi32(a[0], a[1]); + a[1] = _mm_hadd_epi32(a[2], a[3]); + a[0] = RightShiftWithRounding_S32(a[0], kFilterBits); + a[1] = RightShiftWithRounding_S32(a[1], kFilterBits); + + // Clip the values at (1 << bd) - 1 + const __m128i clipped_16 = _mm_min_epi16( + _mm_packus_epi32(a[0], a[1]), _mm_set1_epi16((1 << bitdepth) - 1)); + StoreAligned16(dst_ptr, clipped_16); + dst_ptr += 8; + } while (--x != 0); + src += source_stride; + dst += dest_stride; + } while (--y != 0); +} + +void Init10bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + static_cast<void>(dsp); +#if DSP_ENABLED_10BPP_SSE4_1(SuperResCoefficients) + dsp->super_res_coefficients = SuperResCoefficients_SSE4_1; +#else + static_cast<void>(SuperResCoefficients_SSE4_1); +#endif +#if DSP_ENABLED_10BPP_SSE4_1(SuperRes) + dsp->super_res = SuperRes_SSE4_1<10>; +#else + static_cast<void>(SuperRes_SSE4_1); +#endif +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void SuperResInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/super_res_sse4.h b/src/dsp/x86/super_res_sse4.h index aef5147..07a7ef4 100644 --- 
a/src/dsp/x86/super_res_sse4.h +++ b/src/dsp/x86/super_res_sse4.h @@ -30,9 +30,21 @@ void SuperResInit_SSE4_1(); } // namespace libgav1 #if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_SuperResCoefficients +#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_SSE4_1 +#endif + #ifndef LIBGAV1_Dsp8bpp_SuperRes #define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_SSE4_1 #endif + +#ifndef LIBGAV1_Dsp10bpp_SuperResCoefficients +#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_SuperRes +#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_SSE4_1 +#endif #endif // LIBGAV1_TARGETING_SSE4_1 #endif // LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_ diff --git a/src/dsp/x86/transpose_sse4.h b/src/dsp/x86/transpose_sse4.h index 208b301..9726495 100644 --- a/src/dsp/x86/transpose_sse4.h +++ b/src/dsp/x86/transpose_sse4.h @@ -30,9 +30,9 @@ LIBGAV1_ALWAYS_INLINE void Transpose2x16_U16(const __m128i* const in, __m128i* const out) { // Unpack 16 bit elements. Goes from: // in[0]: 00 01 10 11 20 21 30 31 - // in[0]: 40 41 50 51 60 61 70 71 - // in[0]: 80 81 90 91 a0 a1 b0 b1 - // in[0]: c0 c1 d0 d1 e0 e1 f0 f1 + // in[1]: 40 41 50 51 60 61 70 71 + // in[2]: 80 81 90 91 a0 a1 b0 b1 + // in[3]: c0 c1 d0 d1 e0 e1 f0 f1 // to: // a0: 00 40 01 41 10 50 11 51 // a1: 20 60 21 61 30 70 31 71 diff --git a/src/dsp/x86/warp_sse4.cc b/src/dsp/x86/warp_sse4.cc index 43279ab..9ddfeac 100644 --- a/src/dsp/x86/warp_sse4.cc +++ b/src/dsp/x86/warp_sse4.cc @@ -513,7 +513,7 @@ void WarpInit_SSE4_1() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/weight_mask_sse4.cc b/src/dsp/x86/weight_mask_sse4.cc index dfd5662..08a1739 100644 --- a/src/dsp/x86/weight_mask_sse4.cc +++ b/src/dsp/x86/weight_mask_sse4.cc @@ -36,47 +36,65 @@ namespace { constexpr int kRoundingBits8bpp = 4; -template <bool mask_is_inverse> -inline 
void WeightMask8_SSE4(const int16_t* prediction_0, - const int16_t* prediction_1, uint8_t* mask) { - const __m128i pred_0 = LoadAligned16(prediction_0); - const __m128i pred_1 = LoadAligned16(prediction_1); - const __m128i difference = RightShiftWithRounding_U16( - _mm_abs_epi16(_mm_sub_epi16(pred_0, pred_1)), kRoundingBits8bpp); - const __m128i scaled_difference = _mm_srli_epi16(difference, 4); +template <bool mask_is_inverse, bool is_store_16> +inline void WeightMask16_SSE4(const int16_t* prediction_0, + const int16_t* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const __m128i pred_00 = LoadAligned16(prediction_0); + const __m128i pred_10 = LoadAligned16(prediction_1); + const __m128i difference_0 = RightShiftWithRounding_U16( + _mm_abs_epi16(_mm_sub_epi16(pred_00, pred_10)), kRoundingBits8bpp); + const __m128i scaled_difference_0 = _mm_srli_epi16(difference_0, 4); + + const __m128i pred_01 = LoadAligned16(prediction_0 + 8); + const __m128i pred_11 = LoadAligned16(prediction_1 + 8); + const __m128i difference_1 = RightShiftWithRounding_U16( + _mm_abs_epi16(_mm_sub_epi16(pred_01, pred_11)), kRoundingBits8bpp); + const __m128i scaled_difference_1 = _mm_srli_epi16(difference_1, 4); + const __m128i difference_offset = _mm_set1_epi8(38); const __m128i adjusted_difference = - _mm_adds_epu8(_mm_packus_epi16(scaled_difference, scaled_difference), + _mm_adds_epu8(_mm_packus_epi16(scaled_difference_0, scaled_difference_1), difference_offset); const __m128i mask_ceiling = _mm_set1_epi8(64); const __m128i mask_value = _mm_min_epi8(adjusted_difference, mask_ceiling); if (mask_is_inverse) { const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value); - StoreLo8(mask, inverted_mask_value); + if (is_store_16) { + StoreAligned16(mask, inverted_mask_value); + } else { + StoreLo8(mask, inverted_mask_value); + StoreHi8(mask + mask_stride, inverted_mask_value); + } } else { - StoreLo8(mask, mask_value); + if (is_store_16) { + StoreAligned16(mask, 
mask_value); + } else { + StoreLo8(mask, mask_value); + StoreHi8(mask + mask_stride, mask_value); + } } } -#define WEIGHT8_WITHOUT_STRIDE \ - WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask) +#define WEIGHT8_PAIR_WITHOUT_STRIDE \ + WeightMask16_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride) -#define WEIGHT8_AND_STRIDE \ - WEIGHT8_WITHOUT_STRIDE; \ - pred_0 += 8; \ - pred_1 += 8; \ - mask += mask_stride +#define WEIGHT8_PAIR_AND_STRIDE \ + WEIGHT8_PAIR_WITHOUT_STRIDE; \ + pred_0 += 8 << 1; \ + pred_1 += 8 << 1; \ + mask += mask_stride << 1 template <bool mask_is_inverse> void WeightMask8x8_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y = 0; - do { - WEIGHT8_AND_STRIDE; - } while (++y < 7); - WEIGHT8_WITHOUT_STRIDE; + + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_WITHOUT_STRIDE; } template <bool mask_is_inverse> @@ -84,13 +102,13 @@ void WeightMask8x16_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y3 = 0; + int y3 = 3; do { - WEIGHT8_AND_STRIDE; - WEIGHT8_AND_STRIDE; - WEIGHT8_AND_STRIDE; - } while (++y3 < 5); - WEIGHT8_WITHOUT_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + } while (--y3 != 0); + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_WITHOUT_STRIDE; } template <bool mask_is_inverse> @@ -98,21 +116,17 @@ void WeightMask8x32_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y5 = 0; + int y5 = 5; do { - WEIGHT8_AND_STRIDE; - 
WEIGHT8_AND_STRIDE; - WEIGHT8_AND_STRIDE; - WEIGHT8_AND_STRIDE; - WEIGHT8_AND_STRIDE; - } while (++y5 < 6); - WEIGHT8_AND_STRIDE; - WEIGHT8_WITHOUT_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + } while (--y5 != 0); + WEIGHT8_PAIR_WITHOUT_STRIDE; } -#define WEIGHT16_WITHOUT_STRIDE \ - WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8) +#define WEIGHT16_WITHOUT_STRIDE \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride) #define WEIGHT16_AND_STRIDE \ WEIGHT16_WITHOUT_STRIDE; \ @@ -125,10 +139,10 @@ void WeightMask16x8_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y = 0; + int y = 7; do { WEIGHT16_AND_STRIDE; - } while (++y < 7); + } while (--y != 0); WEIGHT16_WITHOUT_STRIDE; } @@ -137,12 +151,12 @@ void WeightMask16x16_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y3 = 0; + int y3 = 5; do { WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; - } while (++y3 < 5); + } while (--y3 != 0); WEIGHT16_WITHOUT_STRIDE; } @@ -151,14 +165,14 @@ void WeightMask16x32_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y5 = 0; + int y5 = 6; do { WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; - } while (++y5 < 6); + } while (--y5 != 0); WEIGHT16_AND_STRIDE; WEIGHT16_WITHOUT_STRIDE; } @@ -168,20 +182,19 @@ void 
WeightMask16x64_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y3 = 0; + int y3 = 21; do { WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; - } while (++y3 < 21); + } while (--y3 != 0); WEIGHT16_WITHOUT_STRIDE; } -#define WEIGHT32_WITHOUT_STRIDE \ - WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24) +#define WEIGHT32_WITHOUT_STRIDE \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride) #define WEIGHT32_AND_STRIDE \ WEIGHT32_WITHOUT_STRIDE; \ @@ -209,12 +222,12 @@ void WeightMask32x16_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y3 = 0; + int y3 = 5; do { WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; - } while (++y3 < 5); + } while (--y3 != 0); WEIGHT32_WITHOUT_STRIDE; } @@ -223,14 +236,14 @@ void WeightMask32x32_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y5 = 0; + int y5 = 6; do { WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; - } while (++y5 < 6); + } while (--y5 != 0); WEIGHT32_AND_STRIDE; WEIGHT32_WITHOUT_STRIDE; } @@ -240,24 +253,23 @@ void WeightMask32x64_SSE4(const void* 
prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y3 = 0; + int y3 = 21; do { WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; - } while (++y3 < 21); + } while (--y3 != 0); WEIGHT32_WITHOUT_STRIDE; } -#define WEIGHT64_WITHOUT_STRIDE \ - WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 32, pred_1 + 32, mask + 32); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 40, pred_1 + 40, mask + 40); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 48, pred_1 + 48, mask + 48); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 56, pred_1 + 56, mask + 56) +#define WEIGHT64_WITHOUT_STRIDE \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride); \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \ + mask + 32, mask_stride); \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \ + mask + 48, mask_stride) #define WEIGHT64_AND_STRIDE \ WEIGHT64_WITHOUT_STRIDE; \ @@ -447,12 +459,491 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void WeightMaskInit_SSE4_1() { low_bitdepth::Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +constexpr int kRoundingBits10bpp = 6; +constexpr int kScaledDiffShift = 4; + +template <bool mask_is_inverse, bool is_store_16> +inline void WeightMask16_10bpp_SSE4(const uint16_t* prediction_0, + const uint16_t* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const __m128i diff_offset = 
_mm_set1_epi8(38); + const __m128i mask_ceiling = _mm_set1_epi8(64); + const __m128i zero = _mm_setzero_si128(); + + // Range of prediction: [3988, 61532]. + const __m128i pred_00 = LoadAligned16(prediction_0); + const __m128i pred_10 = LoadAligned16(prediction_1); + const __m128i pred_lo_00 = _mm_cvtepu16_epi32(pred_00); + const __m128i pred_lo_10 = _mm_cvtepu16_epi32(pred_10); + const __m128i diff_lo_0 = RightShiftWithRounding_U32( + _mm_abs_epi32(_mm_sub_epi32(pred_lo_00, pred_lo_10)), kRoundingBits10bpp); + + const __m128i pred_hi_00 = _mm_unpackhi_epi16(pred_00, zero); + const __m128i pred_hi_10 = _mm_unpackhi_epi16(pred_10, zero); + const __m128i diff_hi_0 = RightShiftWithRounding_U32( + _mm_abs_epi32(_mm_sub_epi32(pred_hi_00, pred_hi_10)), kRoundingBits10bpp); + + const __m128i diff_0 = _mm_packus_epi32(diff_lo_0, diff_hi_0); + const __m128i scaled_diff_0 = _mm_srli_epi16(diff_0, kScaledDiffShift); + + const __m128i pred_01 = LoadAligned16(prediction_0 + 8); + const __m128i pred_11 = LoadAligned16(prediction_1 + 8); + const __m128i pred_lo_01 = _mm_cvtepu16_epi32(pred_01); + const __m128i pred_lo_11 = _mm_cvtepu16_epi32(pred_11); + const __m128i diff_lo_1 = RightShiftWithRounding_U32( + _mm_abs_epi32(_mm_sub_epi32(pred_lo_01, pred_lo_11)), kRoundingBits10bpp); + + const __m128i pred_hi_01 = _mm_unpackhi_epi16(pred_01, zero); + const __m128i pred_hi_11 = _mm_unpackhi_epi16(pred_11, zero); + const __m128i diff_hi_1 = RightShiftWithRounding_U32( + _mm_abs_epi32(_mm_sub_epi32(pred_hi_01, pred_hi_11)), kRoundingBits10bpp); + + const __m128i diff_1 = _mm_packus_epi32(diff_lo_1, diff_hi_1); + const __m128i scaled_diff_1 = _mm_srli_epi16(diff_1, kScaledDiffShift); + + const __m128i adjusted_diff = _mm_adds_epu8( + _mm_packus_epi16(scaled_diff_0, scaled_diff_1), diff_offset); + const __m128i mask_value = _mm_min_epi8(adjusted_diff, mask_ceiling); + + if (mask_is_inverse) { + const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value); + if 
(is_store_16) { + StoreAligned16(mask, inverted_mask_value); + } else { + StoreLo8(mask, inverted_mask_value); + StoreHi8(mask + mask_stride, inverted_mask_value); + } + } else { + if (is_store_16) { + StoreAligned16(mask, mask_value); + } else { + StoreLo8(mask, mask_value); + StoreHi8(mask + mask_stride, mask_value); + } + } +} + +#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, \ + mask_stride) + +#define WEIGHT8_PAIR_AND_STRIDE_10BPP \ + WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; \ + pred_0 += 8 << 1; \ + pred_1 += 8 << 1; \ + mask += mask_stride << 1 + +template <bool mask_is_inverse> +void WeightMask8x8_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask8x16_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 3; + do { + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask8x32_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y5 = 5; + do { + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + } while (--y5 != 0); + 
WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; +} + +#define WEIGHT16_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride) + +#define WEIGHT16_AND_STRIDE_10BPP \ + WEIGHT16_WITHOUT_STRIDE_10BPP; \ + pred_0 += 16; \ + pred_1 += 16; \ + mask += mask_stride + +template <bool mask_is_inverse> +void WeightMask16x8_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y = 7; + do { + WEIGHT16_AND_STRIDE_10BPP; + } while (--y != 0); + WEIGHT16_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask16x16_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 5; + do { + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT16_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask16x32_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y5 = 6; + do { + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + } while (--y5 != 0); + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask16x64_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const 
uint16_t*>(prediction_1); + int y3 = 21; + do { + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT16_WITHOUT_STRIDE_10BPP; +} + +#define WEIGHT32_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride); \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride) + +#define WEIGHT32_AND_STRIDE_10BPP \ + WEIGHT32_WITHOUT_STRIDE_10BPP; \ + pred_0 += 32; \ + pred_1 += 32; \ + mask += mask_stride + +template <bool mask_is_inverse> +void WeightMask32x8_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask32x16_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 5; + do { + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT32_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask32x32_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y5 = 6; + do { + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; 
+ WEIGHT32_AND_STRIDE_10BPP; + } while (--y5 != 0); + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask32x64_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 21; + do { + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT32_WITHOUT_STRIDE_10BPP; +} + +#define WEIGHT64_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride); \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride); \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \ + mask + 32, mask_stride); \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \ + mask + 48, mask_stride) + +#define WEIGHT64_AND_STRIDE_10BPP \ + WEIGHT64_WITHOUT_STRIDE_10BPP; \ + pred_0 += 64; \ + pred_1 += 64; \ + mask += mask_stride + +template <bool mask_is_inverse> +void WeightMask64x16_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 5; + do { + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask64x32_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y5 = 6; + do { + WEIGHT64_AND_STRIDE_10BPP; + 
WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + } while (--y5 != 0); + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask64x64_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 21; + do { + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask64x128_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 42; + do { + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask128x64_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 21; + const ptrdiff_t adjusted_mask_stride = mask_stride - 64; + do { + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + 
WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + } while (--y3 != 0); + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask128x128_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 42; + const ptrdiff_t adjusted_mask_stride = mask_stride - 64; + do { + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + } while (--y3 != 0); + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +#define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \ + dsp->weight_mask[w_index][h_index][0] = \ + WeightMask##width##x##height##_10bpp_SSE4<0>; \ + dsp->weight_mask[w_index][h_index][1] = \ + WeightMask##width##x##height##_10bpp_SSE4<1> +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + INIT_WEIGHT_MASK_10BPP(8, 8, 0, 0); + INIT_WEIGHT_MASK_10BPP(8, 16, 0, 1); + INIT_WEIGHT_MASK_10BPP(8, 32, 0, 2); + INIT_WEIGHT_MASK_10BPP(16, 8, 1, 0); + 
INIT_WEIGHT_MASK_10BPP(16, 16, 1, 1); + INIT_WEIGHT_MASK_10BPP(16, 32, 1, 2); + INIT_WEIGHT_MASK_10BPP(16, 64, 1, 3); + INIT_WEIGHT_MASK_10BPP(32, 8, 2, 0); + INIT_WEIGHT_MASK_10BPP(32, 16, 2, 1); + INIT_WEIGHT_MASK_10BPP(32, 32, 2, 2); + INIT_WEIGHT_MASK_10BPP(32, 64, 2, 3); + INIT_WEIGHT_MASK_10BPP(64, 16, 3, 1); + INIT_WEIGHT_MASK_10BPP(64, 32, 3, 2); + INIT_WEIGHT_MASK_10BPP(64, 64, 3, 3); + INIT_WEIGHT_MASK_10BPP(64, 128, 3, 4); + INIT_WEIGHT_MASK_10BPP(128, 64, 4, 3); + INIT_WEIGHT_MASK_10BPP(128, 128, 4, 4); +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void WeightMaskInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/weight_mask_sse4.h b/src/dsp/x86/weight_mask_sse4.h index 07636b7..e5d9d70 100644 --- a/src/dsp/x86/weight_mask_sse4.h +++ b/src/dsp/x86/weight_mask_sse4.h @@ -99,6 +99,73 @@ void WeightMaskInit_SSE4_1(); #define LIBGAV1_Dsp8bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1 #endif +#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x8 +#define LIBGAV1_Dsp10bpp_WeightMask_8x8 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x16 +#define LIBGAV1_Dsp10bpp_WeightMask_8x16 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x32 +#define LIBGAV1_Dsp10bpp_WeightMask_8x32 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x8 +#define LIBGAV1_Dsp10bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x16 +#define LIBGAV1_Dsp10bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x32 +#define LIBGAV1_Dsp10bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x64 +#define LIBGAV1_Dsp10bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp10bpp_WeightMask_32x8 +#define LIBGAV1_Dsp10bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x16 +#define LIBGAV1_Dsp10bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x32 +#define LIBGAV1_Dsp10bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x64 +#define LIBGAV1_Dsp10bpp_WeightMask_32x64 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x16 +#define LIBGAV1_Dsp10bpp_WeightMask_64x16 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x32 +#define LIBGAV1_Dsp10bpp_WeightMask_64x32 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x64 +#define LIBGAV1_Dsp10bpp_WeightMask_64x64 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x128 +#define LIBGAV1_Dsp10bpp_WeightMask_64x128 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x64 +#define LIBGAV1_Dsp10bpp_WeightMask_128x64 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x128 +#define LIBGAV1_Dsp10bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1 +#endif #endif // LIBGAV1_TARGETING_SSE4_1 #endif // LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_SSE4_H_ diff --git a/src/gav1/decoder_settings.h b/src/gav1/decoder_settings.h index ab22a4d..7ee487f 100644 --- a/src/gav1/decoder_settings.h +++ b/src/gav1/decoder_settings.h @@ -62,7 +62,8 @@ typedef struct Libgav1DecoderSettings { Libgav1GetFrameBufferCallback get_frame_buffer; // Release frame buffer callback. Libgav1ReleaseFrameBufferCallback release_frame_buffer; - // Release input frame buffer callback. + // Release input frame buffer callback. This callback must be set when + // |frame_parallel| is true. Libgav1ReleaseInputBufferCallback release_input_buffer; // Passed as the private_data argument to the callbacks. 
void* callback_private_data; @@ -117,7 +118,8 @@ struct DecoderSettings { GetFrameBufferCallback get_frame_buffer = nullptr; // Release frame buffer callback. ReleaseFrameBufferCallback release_frame_buffer = nullptr; - // Release input frame buffer callback. + // Release input frame buffer callback. This callback must be set when + // |frame_parallel| is true. ReleaseInputBufferCallback release_input_buffer = nullptr; // Passed as the private_data argument to the callbacks. void* callback_private_data = nullptr; diff --git a/src/gav1/symbol_visibility.h b/src/gav1/symbol_visibility.h index ad7498c..116a514 100644 --- a/src/gav1/symbol_visibility.h +++ b/src/gav1/symbol_visibility.h @@ -58,6 +58,11 @@ // // Much of the above information and more can be found at // https://gcc.gnu.org/wiki/Visibility +// +// NOTE: A third-party build system for libgav1 can add -DLIBGAV1_PUBLIC= to the +// compiler command line to override the definition of LIBGAV1_PUBLIC in this +// header. This can be used to create a libgav1 static library that will not +// export any symbols when it is linked into a shared library. 
#if !defined(LIBGAV1_PUBLIC) #if defined(_WIN32) @@ -76,7 +81,7 @@ #else #define LIBGAV1_PUBLIC #endif // defined(LIBGAV1_BUILDING_DLL) && LIBGAV1_BUILDING_DLL -#else +#else // !defined(_WIN32) #if defined(__GNUC__) && __GNUC__ >= 4 #define LIBGAV1_PUBLIC __attribute__((visibility("default"))) #else diff --git a/src/gav1/version.h b/src/gav1/version.h index 78a573e..c018928 100644 --- a/src/gav1/version.h +++ b/src/gav1/version.h @@ -24,7 +24,7 @@ #define LIBGAV1_MAJOR_VERSION 0 #define LIBGAV1_MINOR_VERSION 16 -#define LIBGAV1_PATCH_VERSION 1 +#define LIBGAV1_PATCH_VERSION 3 #define LIBGAV1_VERSION \ ((LIBGAV1_MAJOR_VERSION << 16) | (LIBGAV1_MINOR_VERSION << 8) | \ diff --git a/src/obu_parser.cc b/src/obu_parser.cc index bbf00ed..69480d7 100644 --- a/src/obu_parser.cc +++ b/src/obu_parser.cc @@ -479,9 +479,13 @@ bool ObuParser::ParseSequenceHeader(bool seen_frame_header) { LIBGAV1_DLOG(ERROR, "Sequence header changed in the middle of a frame."); return false; } + sequence_header_changed_ = true; decoder_state_.ClearReferenceFrames(); } sequence_header_ = sequence_header; + if (!has_sequence_header_) { + sequence_header_changed_ = true; + } has_sequence_header_ = true; // Section 6.4.1: It is a requirement of bitstream conformance that if // OperatingPointIdc is equal to 0, then obu_extension_flag is equal to 0 for @@ -509,12 +513,12 @@ void ObuParser::MarkInvalidReferenceFrames() { if (lower_bound_is_smaller) { if (reference_frame_id > decoder_state_.current_frame_id || reference_frame_id < lower_bound) { - decoder_state_.reference_valid[i] = false; + decoder_state_.reference_frame[i] = nullptr; } } else { if (reference_frame_id > decoder_state_.current_frame_id && reference_frame_id < lower_bound) { - decoder_state_.reference_valid[i] = false; + decoder_state_.reference_frame[i] = nullptr; } } } @@ -621,7 +625,7 @@ bool ObuParser::ParseReferenceOrderHint() { frame_header_.reference_order_hint[i] = scratch; if (frame_header_.reference_order_hint[i] != 
decoder_state_.reference_order_hint[i]) { - decoder_state_.reference_valid[i] = false; + decoder_state_.reference_frame[i] = nullptr; } } return true; @@ -1787,10 +1791,11 @@ bool ObuParser::ParseFrameParameters() { // whenever display_frame_id is read, the value matches // RefFrameId[ frame_to_show_map_idx ] ..., and that // RefValid[ frame_to_show_map_idx ] is equal to 1. + // + // The current_frame_ == nullptr check below is equivalent to checking + // if RefValid[ frame_to_show_map_idx ] is equal to 1. if (frame_header_.display_frame_id != - decoder_state_ - .reference_frame_id[frame_header_.frame_to_show] || - !decoder_state_.reference_valid[frame_header_.frame_to_show]) { + decoder_state_.reference_frame_id[frame_header_.frame_to_show]) { LIBGAV1_DLOG(ERROR, "Reference buffer %d has a frame id number mismatch.", frame_header_.frame_to_show); @@ -1868,7 +1873,6 @@ bool ObuParser::ParseFrameParameters() { } } if (frame_header_.frame_type == kFrameKey && frame_header_.show_frame) { - decoder_state_.reference_valid.fill(false); decoder_state_.reference_order_hint.fill(0); decoder_state_.reference_frame.fill(nullptr); } @@ -2019,15 +2023,8 @@ bool ObuParser::ParseFrameParameters() { // Note if support for Annex C: Error resilience behavior is added this // check should be omitted per C.5 Decoder consequences of processable // frames. - if (!decoder_state_.reference_valid[reference_frame_index]) { - LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not valid.", i, - reference_frame_index); - return false; - } - // Check if the inter frame requests a nonexistent reference, whether or - // not frame_refs_short_signaling is used. 
if (decoder_state_.reference_frame[reference_frame_index] == nullptr) { - LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not a decoded frame.", i, + LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not valid.", i, reference_frame_index); return false; } @@ -2043,12 +2040,8 @@ bool ObuParser::ParseFrameParameters() { // Section 6.8.2: It is a requirement of bitstream conformance that // whenever expectedFrameId[ i ] is calculated, the value matches // RefFrameId[ ref_frame_idx[ i ] ] ... - // - // Section 6.8.2: It is a requirement of bitstream conformance that - // RefValid[ ref_frame_idx[ i ] ] is equal to 1, ... if (frame_header_.expected_frame_id[i] != - decoder_state_.reference_frame_id[reference_frame_index] || - !decoder_state_.reference_valid[reference_frame_index]) { + decoder_state_.reference_frame_id[reference_frame_index]) { LIBGAV1_DLOG(ERROR, "Reference buffer %d has a frame id number mismatch.", reference_frame_index); @@ -2665,6 +2658,7 @@ StatusCode ObuParser::ParseOneFrame(RefCountedBufferPtr* const current_frame) { metadata_ = {}; tile_buffers_.clear(); next_tile_group_start_ = 0; + sequence_header_changed_ = false; bool parsed_one_full_frame = false; bool seen_frame_header = false; diff --git a/src/obu_parser.h b/src/obu_parser.h index 86d165f..c4619ed 100644 --- a/src/obu_parser.h +++ b/src/obu_parser.h @@ -276,6 +276,9 @@ class ObuParser : public Allocable { const ObuFrameHeader& frame_header() const { return frame_header_; } const Vector<TileBuffer>& tile_buffers() const { return tile_buffers_; } const ObuMetadata& metadata() const { return metadata_; } + // Returns true if the last call to ParseOneFrame() encountered a sequence + // header change. + bool sequence_header_changed() const { return sequence_header_changed_; } // Setters. void set_sequence_header(const ObuSequenceHeader& sequence_header) { @@ -384,6 +387,9 @@ class ObuParser : public Allocable { int next_tile_group_start_ = 0; // If true, the sequence_header_ field is valid. 
bool has_sequence_header_ = false; + // If true, it means that the last call to ParseOneFrame() encountered a + // sequence header change. + bool sequence_header_changed_ = false; // If true, the obu_extension_flag syntax element in the OBU header must be // 0. Set to true when parsing a sequence header if OperatingPointIdc is 0. bool extension_disallowed_ = false; diff --git a/src/post_filter.h b/src/post_filter.h index 800d51d..dfcd08e 100644 --- a/src/post_filter.h +++ b/src/post_filter.h @@ -272,8 +272,6 @@ class PostFilter { void CopyBordersForOneSuperBlockRow(int row4x4, int sb4x4, bool for_loop_restoration); // Sets up the |loop_restoration_border_| for loop restoration. - // TODO(linfengz): Unify duplicates in the following two functions if - // possible. // This is called when there is no CDEF filter. We copy rows from // |superres_buffer_| and do the line extension. void SetupLoopRestorationBorder(int row4x4_start); @@ -401,11 +399,14 @@ class PostFilter { // Applies super resolution for the |src| for |rows[plane]| rows of each // plane. If |line_buffer_row| is larger than or equal to 0, one more row will // be processed, the line buffer indicated by |line_buffer_row| will be used - // as the source. + // as the source. If |dst_is_loop_restoration_border| is true, then it means + // that the |dst| pointers come from |loop_restoration_border_| and the + // strides will be populated from that buffer. void ApplySuperRes( const std::array<uint8_t*, kMaxPlanes>& src, const std::array<int, kMaxPlanes>& rows, int line_buffer_row, - const std::array<uint8_t*, kMaxPlanes>& dst); // Section 7.16. + const std::array<uint8_t*, kMaxPlanes>& dst, + bool dst_is_loop_restoration_border = false); // Section 7.16. // Applies SuperRes for the superblock row starting at |row4x4| with a height // of 4*|sb4x4|. 
void ApplySuperResForOneSuperBlockRow(int row4x4, int sb4x4, diff --git a/src/post_filter/cdef.cc b/src/post_filter/cdef.cc index 994f448..f32b0a0 100644 --- a/src/post_filter/cdef.cc +++ b/src/post_filter/cdef.cc @@ -272,7 +272,7 @@ void PostFilter::ApplyCdefForOneUnit(uint16_t* cdef_block, const int index, const uint16_t* cdef_src_row_base[kMaxPlanes]; int cdef_src_row_base_stride[kMaxPlanes]; int column_step[kMaxPlanes]; - assert(planes_ >= 1); + assert(planes_ == kMaxPlanesMonochrome || planes_ == kMaxPlanes); int plane = kPlaneY; do { cdef_buffer_row_base[plane] = diff --git a/src/post_filter/loop_restoration.cc b/src/post_filter/loop_restoration.cc index 3d5da90..826ef48 100644 --- a/src/post_filter/loop_restoration.cc +++ b/src/post_filter/loop_restoration.cc @@ -29,15 +29,15 @@ void PostFilter::ApplyLoopRestorationForOneRow( unit_row * num_horizontal_units); const bool in_place = DoCdef() || thread_pool_ != nullptr; const Pixel* border = nullptr; + ptrdiff_t border_stride = 0; src_buffer += unit_y * stride; if (in_place) { - assert(loop_restoration_border_.stride(plane) == - static_cast<int>(sizeof(Pixel) * stride)); const int border_unit_y = std::max( RightShiftWithCeiling(unit_y, 4 - subsampling_y_[plane]) - 4, 0); + border_stride = loop_restoration_border_.stride(plane) / sizeof(Pixel); border = reinterpret_cast<const Pixel*>(loop_restoration_border_.data(plane)) + - border_unit_y * stride; + border_unit_y * border_stride; } int unit_column = 0; int column = 0; @@ -61,18 +61,22 @@ void PostFilter::ApplyLoopRestorationForOneRow( } } else { const Pixel* top_border = src - kRestorationVerticalBorder * stride; + ptrdiff_t top_border_stride = stride; const Pixel* bottom_border = src + current_process_unit_height * stride; + ptrdiff_t bottom_border_stride = stride; const bool frame_bottom_border = (unit_y + current_process_unit_height >= plane_height); if (in_place && (unit_y != 0 || !frame_bottom_border)) { const Pixel* loop_restoration_border = border + 
column; if (unit_y != 0) { top_border = loop_restoration_border; - loop_restoration_border += 4 * stride; + top_border_stride = border_stride; + loop_restoration_border += 4 * border_stride; } if (!frame_bottom_border) { - bottom_border = - loop_restoration_border + kRestorationVerticalBorder * stride; + bottom_border = loop_restoration_border + + kRestorationVerticalBorder * border_stride; + bottom_border_stride = border_stride; } } RestorationBuffer restoration_buffer; @@ -81,10 +85,10 @@ void PostFilter::ApplyLoopRestorationForOneRow( type == kLoopRestorationTypeWiener); const dsp::LoopRestorationFunc restoration_func = dsp_.loop_restorations[type - 2]; - restoration_func(restoration_info[unit_column], src, top_border, - bottom_border, stride, current_process_unit_width, - current_process_unit_height, &restoration_buffer, - dst_buffer + column); + restoration_func(restoration_info[unit_column], src, stride, top_border, + top_border_stride, bottom_border, bottom_border_stride, + current_process_unit_width, current_process_unit_height, + &restoration_buffer, dst_buffer + column); } ++unit_column; column += plane_unit_size; diff --git a/src/post_filter/post_filter.cc b/src/post_filter/post_filter.cc index 0eacf34..7671f01 100644 --- a/src/post_filter/post_filter.cc +++ b/src/post_filter/post_filter.cc @@ -306,11 +306,11 @@ void PostFilter::ExtendBordersForReferenceFrame() { } void PostFilter::CopyDeblockedPixels(Plane plane, int row4x4) { - assert(frame_buffer_.stride(plane) == loop_restoration_border_.stride(plane)); - const ptrdiff_t stride = frame_buffer_.stride(plane); + const ptrdiff_t src_stride = frame_buffer_.stride(plane); const uint8_t* const src = GetSourceBuffer(plane, row4x4, 0); const int row_offset = DivideBy4(row4x4); - uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * stride; + const ptrdiff_t dst_stride = loop_restoration_border_.stride(plane); + uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * dst_stride; const 
int num_pixels = SubsampledValue(MultiplyBy4(frame_header_.columns4x4), subsampling_x_[plane]); const int row_width = num_pixels << pixel_size_log2_; @@ -326,9 +326,9 @@ void PostFilter::CopyDeblockedPixels(Plane plane, int row4x4) { // border extension). row = last_valid_row; } - memcpy(dst, src + row * stride, row_width); + memcpy(dst, src + row * src_stride, row_width); last_valid_row = row; - dst += stride; + dst += dst_stride; } } @@ -395,9 +395,6 @@ void PostFilter::SetupLoopRestorationBorder(const int row4x4) { if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) { continue; } - assert(frame_buffer_.stride(plane) == - loop_restoration_border_.stride(plane)); - const ptrdiff_t stride = frame_buffer_.stride(plane); const int row_offset = DivideBy4(row4x4); const int num_pixels = SubsampledValue(upscaled_width_, subsampling_x_[plane]); @@ -406,9 +403,13 @@ void PostFilter::SetupLoopRestorationBorder(const int row4x4) { const int row = kLoopRestorationBorderRows[subsampling_y_[plane]]; const int absolute_row = (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row; + const ptrdiff_t src_stride = frame_buffer_.stride(plane); const uint8_t* src = - GetSuperResBuffer(static_cast<Plane>(plane), row4x4, 0) + row * stride; - uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * stride; + GetSuperResBuffer(static_cast<Plane>(plane), row4x4, 0) + + row * src_stride; + const ptrdiff_t dst_stride = loop_restoration_border_.stride(plane); + uint8_t* dst = + loop_restoration_border_.data(plane) + row_offset * dst_stride; for (int i = 0; i < 4; ++i) { memcpy(dst, src, row_width); #if LIBGAV1_MAX_BITDEPTH >= 10 @@ -421,8 +422,8 @@ void PostFilter::SetupLoopRestorationBorder(const int row4x4) { kRestorationHorizontalBorder); // If we run out of rows, copy the last valid row (mimics the bottom // border extension). 
- if (absolute_row + i < plane_height - 1) src += stride; - dst += stride; + if (absolute_row + i < plane_height - 1) src += src_stride; + dst += dst_stride; } } while (++plane < planes_); } @@ -434,7 +435,7 @@ void PostFilter::SetupLoopRestorationBorder(int row4x4_start, int sb4x4) { for (int sb_y = 0; sb_y < sb4x4; sb_y += 16) { const int row4x4 = row4x4_start + sb_y; const int row_offset_start = DivideBy4(row4x4); - std::array<uint8_t*, kMaxPlanes> dst = { + const std::array<uint8_t*, kMaxPlanes> dst = { loop_restoration_border_.data(kPlaneY) + row_offset_start * loop_restoration_border_.stride(kPlaneY), loop_restoration_border_.data(kPlaneU) + @@ -462,13 +463,14 @@ void PostFilter::SetupLoopRestorationBorder(int row4x4_start, int sb4x4) { row * frame_buffer_.stride(plane); rows[plane] = Clip3(plane_height - absolute_row, 0, 4); } while (++plane < planes_); - ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst); + ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst, + /*dst_is_loop_restoration_border=*/true); // If we run out of rows, copy the last valid row (mimics the bottom // border extension). 
plane = kPlaneY; do { if (rows[plane] == 0 || rows[plane] >= 4) continue; - const ptrdiff_t stride = frame_buffer_.stride(plane); + const ptrdiff_t stride = loop_restoration_border_.stride(plane); uint8_t* dst_line = dst[plane] + rows[plane] * stride; const uint8_t* const src_line = dst_line - stride; const int upscaled_width = super_res_info_[plane].upscaled_width diff --git a/src/post_filter/super_res.cc b/src/post_filter/super_res.cc index a70e4ed..554e537 100644 --- a/src/post_filter/super_res.cc +++ b/src/post_filter/super_res.cc @@ -19,7 +19,8 @@ namespace libgav1 { void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src, const std::array<int, kMaxPlanes>& rows, const int line_buffer_row, - const std::array<uint8_t*, kMaxPlanes>& dst) { + const std::array<uint8_t*, kMaxPlanes>& dst, + bool dst_is_loop_restoration_border /*=false*/) { int plane = kPlaneY; do { const int plane_width = @@ -28,13 +29,19 @@ void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src, if (bitdepth_ >= 10) { auto* input = reinterpret_cast<uint16_t*>(src[plane]); auto* output = reinterpret_cast<uint16_t*>(dst[plane]); - const ptrdiff_t stride = frame_buffer_.stride(plane) / sizeof(uint16_t); + const ptrdiff_t input_stride = + frame_buffer_.stride(plane) / sizeof(uint16_t); + const ptrdiff_t output_stride = + (dst_is_loop_restoration_border + ? loop_restoration_border_.stride(plane) + : frame_buffer_.stride(plane)) / + sizeof(uint16_t); if (rows[plane] > 0) { dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)], - input, stride, rows[plane], plane_width, + input, input_stride, rows[plane], plane_width, super_res_info_[plane].upscaled_width, super_res_info_[plane].initial_subpixel_x, - super_res_info_[plane].step, output); + super_res_info_[plane].step, output, output_stride); } // In the multi-threaded case, the |superres_line_buffer_| holds the last // input row. Apply SuperRes for that row. 
@@ -44,24 +51,29 @@ void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src, line_buffer_row * superres_line_buffer_.stride(plane) / sizeof(uint16_t) + kSuperResHorizontalBorder; - dsp_.super_res( - superres_coefficients_[static_cast<int>(plane != 0)], - line_buffer_start, /*stride=*/0, - /*height=*/1, plane_width, super_res_info_[plane].upscaled_width, - super_res_info_[plane].initial_subpixel_x, - super_res_info_[plane].step, output + rows[plane] * stride); + dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)], + line_buffer_start, /*source_stride=*/0, + /*height=*/1, plane_width, + super_res_info_[plane].upscaled_width, + super_res_info_[plane].initial_subpixel_x, + super_res_info_[plane].step, + output + rows[plane] * output_stride, /*dest_stride=*/0); } continue; } #endif // LIBGAV1_MAX_BITDEPTH >= 10 uint8_t* input = src[plane]; uint8_t* output = dst[plane]; + const ptrdiff_t input_stride = frame_buffer_.stride(plane); + const ptrdiff_t output_stride = dst_is_loop_restoration_border + ? loop_restoration_border_.stride(plane) + : frame_buffer_.stride(plane); if (rows[plane] > 0) { dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)], - input, frame_buffer_.stride(plane), rows[plane], - plane_width, super_res_info_[plane].upscaled_width, + input, input_stride, rows[plane], plane_width, + super_res_info_[plane].upscaled_width, super_res_info_[plane].initial_subpixel_x, - super_res_info_[plane].step, output); + super_res_info_[plane].step, output, output_stride); } // In the multi-threaded case, the |superres_line_buffer_| holds the last // input row. Apply SuperRes for that row. 
@@ -70,13 +82,13 @@ void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src, superres_line_buffer_.data(plane) + line_buffer_row * superres_line_buffer_.stride(plane) + kSuperResHorizontalBorder; - dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)], - line_buffer_start, /*stride=*/0, - /*height=*/1, plane_width, - super_res_info_[plane].upscaled_width, - super_res_info_[plane].initial_subpixel_x, - super_res_info_[plane].step, - output + rows[plane] * frame_buffer_.stride(plane)); + dsp_.super_res( + superres_coefficients_[static_cast<int>(plane != 0)], + line_buffer_start, /*source_stride=*/0, + /*height=*/1, plane_width, super_res_info_[plane].upscaled_width, + super_res_info_[plane].initial_subpixel_x, + super_res_info_[plane].step, output + rows[plane] * output_stride, + /*dest_stride=*/0); } } while (++plane < planes_); } diff --git a/src/residual_buffer_pool.cc b/src/residual_buffer_pool.cc index e166392..44a842c 100644 --- a/src/residual_buffer_pool.cc +++ b/src/residual_buffer_pool.cc @@ -129,7 +129,8 @@ std::unique_ptr<ResidualBuffer> ResidualBufferPool::Get() { } void ResidualBufferPool::Release(std::unique_ptr<ResidualBuffer> buffer) { - buffer->transform_parameters()->Reset(); + buffer->transform_parameters()->Clear(); + buffer->partition_tree_order()->Clear(); std::lock_guard<std::mutex> lock(mutex_); buffers_.Push(std::move(buffer)); } diff --git a/src/residual_buffer_pool.h b/src/residual_buffer_pool.h index f7bc75d..75924db 100644 --- a/src/residual_buffer_pool.h +++ b/src/residual_buffer_pool.h @@ -27,73 +27,11 @@ #include "src/utils/compiler_attributes.h" #include "src/utils/constants.h" #include "src/utils/memory.h" +#include "src/utils/queue.h" #include "src/utils/types.h" namespace libgav1 { -// A simple fixed size queue implementation to hold the transform parameters -// when |Tile::split_parse_and_decode_| is true. 
We don't have to do any -// boundary checks since we always push data into the queue before accessing it. -class TransformParameterQueue { - public: - TransformParameterQueue() = default; - - // Move only. - TransformParameterQueue(TransformParameterQueue&& other) = default; - TransformParameterQueue& operator=(TransformParameterQueue&& other) = default; - - LIBGAV1_MUST_USE_RESULT bool Init(int max_size) { - max_size_ = max_size; - // No initialization is necessary since the data will be always written to - // before being read. - non_zero_coeff_count_.reset(new (std::nothrow) int16_t[max_size_]); - tx_type_.reset(new (std::nothrow) TransformType[max_size_]); - return non_zero_coeff_count_ != nullptr && tx_type_ != nullptr; - } - - // Adds the |non_zero_coeff_count| and the |tx_type| to the back of the queue. - void Push(int non_zero_coeff_count, TransformType tx_type) { - assert(back_ < max_size_); - non_zero_coeff_count_[back_] = non_zero_coeff_count; - tx_type_[back_++] = tx_type; - } - - // Returns the non_zero_coeff_count at the front of the queue. - int16_t NonZeroCoeffCount() const { - assert(front_ != back_); - return non_zero_coeff_count_[front_]; - } - - // Returns the tx_type at the front of the queue. - TransformType Type() const { - assert(front_ != back_); - return tx_type_[front_]; - } - - // Removes the |non_zero_coeff_count| and the |tx_type| from the front of the - // queue. - void Pop() { - assert(front_ != back_); - ++front_; - } - - // Clears the queue. - void Reset() { - front_ = 0; - back_ = 0; - } - - // Used only in the tests. Returns the number of elements in the queue. - int Size() const { return back_ - front_; } - - private: - int max_size_ = 0; - std::unique_ptr<int16_t[]> non_zero_coeff_count_; - std::unique_ptr<TransformType[]> tx_type_; - int front_ = 0; - int back_ = 0; -}; - // This class is used for parsing and decoding a superblock. Members of this // class are populated in the "parse" step and consumed in the "decode" step. 
class ResidualBuffer : public Allocable { @@ -104,7 +42,8 @@ class ResidualBuffer : public Allocable { if (buffer != nullptr) { buffer->buffer_ = MakeAlignedUniquePtr<uint8_t>(32, buffer_size); if (buffer->buffer_ == nullptr || - !buffer->transform_parameters_.Init(queue_size)) { + !buffer->transform_parameters_.Init(queue_size) || + !buffer->partition_tree_order_.Init(queue_size)) { buffer = nullptr; } } @@ -118,9 +57,14 @@ class ResidualBuffer : public Allocable { // Buffer used to store the residual values. uint8_t* buffer() { return buffer_.get(); } // Queue used to store the transform parameters. - TransformParameterQueue* transform_parameters() { + Queue<TransformParameters>* transform_parameters() { return &transform_parameters_; } + // Queue used to store the block ordering in the partition tree of the + // superblocks. + Queue<PartitionTreeNode>* partition_tree_order() { + return &partition_tree_order_; + } private: friend class ResidualBufferStack; @@ -128,7 +72,8 @@ class ResidualBuffer : public Allocable { ResidualBuffer() = default; AlignedUniquePtr<uint8_t> buffer_; - TransformParameterQueue transform_parameters_; + Queue<TransformParameters> transform_parameters_; + Queue<PartitionTreeNode> partition_tree_order_; // Used by ResidualBufferStack to form a chain of ResidualBuffers. ResidualBuffer* next_ = nullptr; }; diff --git a/src/threading_strategy.cc b/src/threading_strategy.cc index cd4d576..17ce18f 100644 --- a/src/threading_strategy.cc +++ b/src/threading_strategy.cc @@ -36,24 +36,25 @@ constexpr int kFrameParallelThresholdMultiplier = // Computes the number of frame threads to be used based on the following // heuristic: // * If |thread_count| == 1, return 0. -// * If |thread_count| <= |tile_count| * 4, return 0. +// * If |thread_count| <= |tile_count| * kFrameParallelThresholdMultiplier, +// return 0. // * Otherwise, return the largest value of i which satisfies the following // condition: i + i * tile_columns <= thread_count. 
This ensures that there // are at least |tile_columns| worker threads for each frame thread. // * This function will never return 1 or a value > |thread_count|. // -// This heuristic is based empirical performance data. The in-frame threading -// model (combination of tile multithreading, superblock row multithreading and -// post filter multithreading) performs better than the frame parallel model -// until we reach the threshold of |thread_count| > |tile_count| * -// kFrameParallelThresholdMultiplier. +// This heuristic is based on empirical performance data. The in-frame +// threading model (combination of tile multithreading, superblock row +// multithreading and post filter multithreading) performs better than the +// frame parallel model until we reach the threshold of |thread_count| > +// |tile_count| * kFrameParallelThresholdMultiplier. // // It is a function of |tile_count| since tile threading and superblock row -// multithreading will scale only as a factor of |tile_count|. The threshold 4 -// is arrived at based on empirical data. The general idea is that superblock -// row multithreading plateaus at 4 * |tile_count| because in most practical -// cases there aren't more than that many superblock rows and columns available -// to work on in parallel. +// multithreading will scale only as a factor of |tile_count|. The threshold +// kFrameParallelThresholdMultiplier is arrived at based on empirical data. +// The general idea is that superblock row multithreading plateaus at 4 * +// |tile_count| because in most practical cases there aren't more than that +// many superblock rows and columns available to work on in parallel. 
int ComputeFrameThreadCount(int thread_count, int tile_count, int tile_columns) { assert(thread_count > 0); @@ -132,7 +133,7 @@ bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header, thread_count -= 2; if (thread_count <= 0) break; } -#else // !defined(__ANDROID__) +#else // !defined(__ANDROID__) // Assign the remaining threads to each Tile. for (int i = 0; i < tile_count; ++i) { const int count = thread_count / tile_count + @@ -48,7 +48,6 @@ #include "src/utils/constants.h" #include "src/utils/entropy_decoder.h" #include "src/utils/memory.h" -#include "src/utils/parameter_tree.h" #include "src/utils/segmentation_map.h" #include "src/utils/threadpool.h" #include "src/utils/types.h" @@ -292,26 +291,25 @@ class Tile : public Allocable { // iteratively. It performs a DFS traversal over the partition tree to process // the blocks in the right order. bool ProcessPartition( - int row4x4_start, int column4x4_start, ParameterTree* root, - TileScratchBuffer* scratch_buffer, + int row4x4_start, int column4x4_start, TileScratchBuffer* scratch_buffer, ResidualPtr* residual); // Iterative implementation of 5.11.4. bool ProcessBlock(int row4x4, int column4x4, BlockSize block_size, - ParameterTree* tree, TileScratchBuffer* scratch_buffer, + TileScratchBuffer* scratch_buffer, ResidualPtr* residual); // 5.11.5. void ResetCdef(int row4x4, int column4x4); // 5.11.55. // This function is used to decode a superblock when the parsing has already // been done for that superblock. - bool DecodeSuperBlock(ParameterTree* tree, TileScratchBuffer* scratch_buffer, - ResidualPtr* residual); + bool DecodeSuperBlock(int sb_row_index, int sb_column_index, + TileScratchBuffer* scratch_buffer); // Helper function used by DecodeSuperBlock(). Note that the decode_block() // function in the spec is equivalent to ProcessBlock() in the code. 
- bool DecodeBlock(ParameterTree* tree, TileScratchBuffer* scratch_buffer, - ResidualPtr* residual); + bool DecodeBlock(int row4x4, int column4x4, BlockSize block_size, + TileScratchBuffer* scratch_buffer, ResidualPtr* residual); void ClearBlockDecoded(TileScratchBuffer* scratch_buffer, int row4x4, int column4x4); // 5.11.3. - bool ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4, + bool ProcessSuperBlock(int row4x4, int column4x4, TileScratchBuffer* scratch_buffer, ProcessingMode mode); void ResetLoopRestorationParams(); diff --git a/src/tile/bitstream/palette.cc b/src/tile/bitstream/palette.cc index 674d210..41b42d6 100644 --- a/src/tile/bitstream/palette.cc +++ b/src/tile/bitstream/palette.cc @@ -130,10 +130,10 @@ void Tile::ReadPaletteColors(const Block& block, Plane plane) { void Tile::ReadPaletteModeInfo(const Block& block) { BlockParameters& bp = *block.bp; + bp.palette_mode_info.size[kPlaneTypeY] = 0; + bp.palette_mode_info.size[kPlaneTypeUV] = 0; if (IsBlockSmallerThan8x8(block.size) || block.size > kBlock64x64 || !frame_header_.allow_screen_content_tools) { - bp.palette_mode_info.size[kPlaneTypeY] = 0; - bp.palette_mode_info.size[kPlaneTypeUV] = 0; return; } const int block_size_context = @@ -156,7 +156,7 @@ void Tile::ReadPaletteModeInfo(const Block& block) { ReadPaletteColors(block, kPlaneY); } } - if (bp.uv_mode == kPredictionModeDc && block.HasChroma()) { + if (block.HasChroma() && bp.uv_mode == kPredictionModeDc) { const int context = static_cast<int>(bp.palette_mode_info.size[kPlaneTypeY] > 0); const bool has_palette_uv = diff --git a/src/tile/tile.cc b/src/tile/tile.cc index ee48f17..9699517 100644 --- a/src/tile/tile.cc +++ b/src/tile/tile.cc @@ -609,7 +609,7 @@ bool Tile::ProcessSuperBlockRow(int row4x4, const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()]; for (int column4x4 = column4x4_start_; column4x4 < column4x4_end_; column4x4 += block_width4x4) { - if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4, 
scratch_buffer, + if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer, processing_mode)) { LIBGAV1_DLOG(ERROR, "Error decoding super block row: %d column: %d", row4x4, column4x4); @@ -642,9 +642,6 @@ void Tile::SaveSymbolDecoderContext() { } bool Tile::ParseAndDecode() { - // If this is the main thread, we build the loop filter bit masks when parsing - // so that it happens in the current thread. This ensures that the main thread - // does as much work as possible. if (split_parse_and_decode_) { if (!ThreadedParseAndDecode()) return false; SaveSymbolDecoderContext(); @@ -776,8 +773,8 @@ bool Tile::ThreadedParseAndDecode() { for (int column4x4 = column4x4_start_, column_index = 0; column4x4 < column4x4_end_; column4x4 += block_width4x4, ++column_index) { - if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4, - scratch_buffer.get(), kProcessingModeParseOnly)) { + if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(), + kProcessingModeParseOnly)) { std::lock_guard<std::mutex> lock(threading_.mutex); threading_.abort = true; break; @@ -862,8 +859,8 @@ void Tile::DecodeSuperBlock(int row_index, int column_index, tile_scratch_buffer_pool_->Get(); bool ok = scratch_buffer != nullptr; if (ok) { - ok = ProcessSuperBlock(row4x4, column4x4, block_width4x4, - scratch_buffer.get(), kProcessingModeDecodeOnly); + ok = ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(), + kProcessingModeDecodeOnly); tile_scratch_buffer_pool_->Release(std::move(scratch_buffer)); } std::unique_lock<std::mutex> lock(threading_.mutex); @@ -1629,11 +1626,12 @@ bool Tile::TransformBlock(const Block& block, Plane plane, int base_x, const int sb_row_index = SuperBlockRowIndex(block.row4x4); const int sb_column_index = SuperBlockColumnIndex(block.column4x4); if (mode == kProcessingModeDecodeOnly) { - TransformParameterQueue& tx_params = + Queue<TransformParameters>& tx_params = *residual_buffer_threaded_[sb_row_index][sb_column_index] ->transform_parameters(); 
ReconstructBlock(block, plane, start_x, start_y, tx_size, - tx_params.Type(), tx_params.NonZeroCoeffCount()); + tx_params.Front().type, + tx_params.Front().non_zero_coeff_count); tx_params.Pop(); } else { TransformType tx_type; @@ -1656,7 +1654,7 @@ bool Tile::TransformBlock(const Block& block, Plane plane, int base_x, assert(mode == kProcessingModeParseOnly); residual_buffer_threaded_[sb_row_index][sb_column_index] ->transform_parameters() - ->Push(non_zero_coeff_count, tx_type); + ->Push(TransformParameters(tx_type, non_zero_coeff_count)); } } } @@ -1886,6 +1884,7 @@ bool Tile::AssignInterMv(const Block& block, bool is_compound) { GetClampParameters(block, min, max); BlockParameters& bp = *block.bp; const PredictionParameters& prediction_parameters = *bp.prediction_parameters; + bp.mv.mv64 = 0; if (is_compound) { for (int i = 0; i < 2; ++i) { const PredictionMode mode = GetSinglePredictionMode(i, bp.y_mode); @@ -1948,6 +1947,7 @@ bool Tile::AssignIntraMv(const Block& block) { BlockParameters& bp = *block.bp; const PredictionParameters& prediction_parameters = *bp.prediction_parameters; const MotionVector& ref_mv_0 = prediction_parameters.reference_mv(0); + bp.mv.mv64 = 0; ReadMotionVector(block, 0); if (ref_mv_0.mv32 == 0) { const MotionVector& ref_mv_1 = prediction_parameters.reference_mv(1); @@ -2122,7 +2122,6 @@ void Tile::PopulateDeblockFilterLevel(const Block& block) { } bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size, - ParameterTree* const tree, TileScratchBuffer* const scratch_buffer, ResidualPtr* residual) { // Do not process the block if the starting point is beyond the visible frame. @@ -2133,8 +2132,24 @@ bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size, column4x4 >= frame_header_.columns4x4) { return true; } - BlockParameters& bp = *tree->parameters(); - block_parameters_holder_.FillCache(row4x4, column4x4, block_size, &bp); + + if (split_parse_and_decode_) { + // Push block ordering info to the queue. 
DecodeBlock() will use this queue + // to decode the blocks in the correct order. + const int sb_row_index = SuperBlockRowIndex(row4x4); + const int sb_column_index = SuperBlockColumnIndex(column4x4); + residual_buffer_threaded_[sb_row_index][sb_column_index] + ->partition_tree_order() + ->Push(PartitionTreeNode(row4x4, column4x4, block_size)); + } + + BlockParameters* bp_ptr = + block_parameters_holder_.Get(row4x4, column4x4, block_size); + if (bp_ptr == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to get BlockParameters."); + return false; + } + BlockParameters& bp = *bp_ptr; Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual); bp.size = block_size; bp.prediction_parameters = @@ -2186,16 +2201,13 @@ bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size, return true; } -bool Tile::DecodeBlock(ParameterTree* const tree, +bool Tile::DecodeBlock(int row4x4, int column4x4, BlockSize block_size, TileScratchBuffer* const scratch_buffer, ResidualPtr* residual) { - const int row4x4 = tree->row4x4(); - const int column4x4 = tree->column4x4(); if (row4x4 >= frame_header_.rows4x4 || column4x4 >= frame_header_.columns4x4) { return true; } - const BlockSize block_size = tree->block_size(); Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual); if (!ComputePrediction(block) || !Residual(block, kProcessingModeDecodeOnly)) { @@ -2206,27 +2218,22 @@ bool Tile::DecodeBlock(ParameterTree* const tree, } bool Tile::ProcessPartition(int row4x4_start, int column4x4_start, - ParameterTree* const root, TileScratchBuffer* const scratch_buffer, ResidualPtr* residual) { - Stack<ParameterTree*, kDfsStackSize> stack; + Stack<PartitionTreeNode, kDfsStackSize> stack; // Set up the first iteration. - ParameterTree* node = root; - int row4x4 = row4x4_start; - int column4x4 = column4x4_start; - BlockSize block_size = SuperBlockSize(); + stack.Push( + PartitionTreeNode(row4x4_start, column4x4_start, SuperBlockSize())); // DFS loop. 
If it sees a terminal node (leaf node), ProcessBlock is invoked. // Otherwise, the children are pushed into the stack for future processing. do { - if (!stack.Empty()) { - // Set up subsequent iterations. - node = stack.Pop(); - row4x4 = node->row4x4(); - column4x4 = node->column4x4(); - block_size = node->block_size(); - } + PartitionTreeNode node = stack.Pop(); + int row4x4 = node.row4x4; + int column4x4 = node.column4x4; + BlockSize block_size = node.block_size; + if (row4x4 >= frame_header_.rows4x4 || column4x4 >= frame_header_.columns4x4) { continue; @@ -2262,13 +2269,13 @@ bool Tile::ProcessPartition(int row4x4_start, int column4x4_start, sequence_header_.color_config.subsampling_y); return false; } - if (!node->SetPartitionType(partition)) { - LIBGAV1_DLOG(ERROR, "node->SetPartitionType() failed."); - return false; - } + + const int quarter_block4x4 = half_block4x4 >> 1; + const BlockSize split_size = kSubSize[kPartitionSplit][block_size]; + assert(partition == kPartitionNone || sub_size != kBlockInvalid); switch (partition) { case kPartitionNone: - if (!ProcessBlock(row4x4, column4x4, sub_size, node, scratch_buffer, + if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer, residual)) { return false; } @@ -2276,28 +2283,82 @@ bool Tile::ProcessPartition(int row4x4_start, int column4x4_start, case kPartitionSplit: // The children must be added in reverse order since a stack is being // used. 
- for (int i = 3; i >= 0; --i) { - ParameterTree* const child = node->children(i); - assert(child != nullptr); - stack.Push(child); - } + stack.Push(PartitionTreeNode(row4x4 + half_block4x4, + column4x4 + half_block4x4, sub_size)); + stack.Push( + PartitionTreeNode(row4x4 + half_block4x4, column4x4, sub_size)); + stack.Push( + PartitionTreeNode(row4x4, column4x4 + half_block4x4, sub_size)); + stack.Push(PartitionTreeNode(row4x4, column4x4, sub_size)); break; case kPartitionHorizontal: + if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer, + residual) || + !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size, + scratch_buffer, residual)) { + return false; + } + break; case kPartitionVertical: + if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer, + residual) || + !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size, + scratch_buffer, residual)) { + return false; + } + break; case kPartitionHorizontalWithTopSplit: + if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer, + residual) || + !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size, + scratch_buffer, residual) || + !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size, + scratch_buffer, residual)) { + return false; + } + break; case kPartitionHorizontalWithBottomSplit: + if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer, + residual) || + !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size, + scratch_buffer, residual) || + !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4, + split_size, scratch_buffer, residual)) { + return false; + } + break; case kPartitionVerticalWithLeftSplit: + if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer, + residual) || + !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size, + scratch_buffer, residual) || + !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size, + scratch_buffer, residual)) { + return false; + } + break; case kPartitionVerticalWithRightSplit: + if 
(!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer, + residual) || + !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size, + scratch_buffer, residual) || + !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4, + split_size, scratch_buffer, residual)) { + return false; + } + break; case kPartitionHorizontal4: + for (int i = 0; i < 4; ++i) { + if (!ProcessBlock(row4x4 + i * quarter_block4x4, column4x4, sub_size, + scratch_buffer, residual)) { + return false; + } + } + break; case kPartitionVertical4: for (int i = 0; i < 4; ++i) { - ParameterTree* const child = node->children(i); - // Once a null child is seen, all the subsequent children will also be - // null. - if (child == nullptr) break; - if (!ProcessBlock(child->row4x4(), child->column4x4(), - child->block_size(), child, scratch_buffer, - residual)) { + if (!ProcessBlock(row4x4, column4x4 + i * quarter_block4x4, sub_size, + scratch_buffer, residual)) { return false; } } @@ -2370,7 +2431,7 @@ void Tile::ClearBlockDecoded(TileScratchBuffer* const scratch_buffer, } } -bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4, +bool Tile::ProcessSuperBlock(int row4x4, int column4x4, TileScratchBuffer* const scratch_buffer, ProcessingMode mode) { const bool parsing = @@ -2388,13 +2449,10 @@ bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4, if (parsing) { ReadLoopRestorationCoefficients(row4x4, column4x4, block_size); } - const int row = row4x4 / block_width4x4; - const int column = column4x4 / block_width4x4; if (parsing && decoding) { uint8_t* residual_buffer = residual_buffer_.get(); - if (!ProcessPartition(row4x4, column4x4, - block_parameters_holder_.Tree(row, column), - scratch_buffer, &residual_buffer)) { + if (!ProcessPartition(row4x4, column4x4, scratch_buffer, + &residual_buffer)) { LIBGAV1_DLOG(ERROR, "Error decoding partition row: %d column: %d", row4x4, column4x4); return false; @@ -2412,18 +2470,14 @@ bool Tile::ProcessSuperBlock(int 
row4x4, int column4x4, int block_width4x4, } uint8_t* residual_buffer = residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer(); - if (!ProcessPartition(row4x4, column4x4, - block_parameters_holder_.Tree(row, column), - scratch_buffer, &residual_buffer)) { + if (!ProcessPartition(row4x4, column4x4, scratch_buffer, + &residual_buffer)) { LIBGAV1_DLOG(ERROR, "Error parsing partition row: %d column: %d", row4x4, column4x4); return false; } } else { - uint8_t* residual_buffer = - residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer(); - if (!DecodeSuperBlock(block_parameters_holder_.Tree(row, column), - scratch_buffer, &residual_buffer)) { + if (!DecodeSuperBlock(sb_row_index, sb_column_index, scratch_buffer)) { LIBGAV1_DLOG(ERROR, "Error decoding superblock row: %d column: %d", row4x4, column4x4); return false; @@ -2434,26 +2488,23 @@ bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4, return true; } -bool Tile::DecodeSuperBlock(ParameterTree* const tree, - TileScratchBuffer* const scratch_buffer, - ResidualPtr* residual) { - Stack<ParameterTree*, kDfsStackSize> stack; - stack.Push(tree); - do { - ParameterTree* const node = stack.Pop(); - if (node->partition() != kPartitionNone) { - for (int i = 3; i >= 0; --i) { - if (node->children(i) == nullptr) continue; - stack.Push(node->children(i)); - } - continue; - } - if (!DecodeBlock(node, scratch_buffer, residual)) { +bool Tile::DecodeSuperBlock(int sb_row_index, int sb_column_index, + TileScratchBuffer* const scratch_buffer) { + uint8_t* residual_buffer = + residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer(); + Queue<PartitionTreeNode>& partition_tree_order = + *residual_buffer_threaded_[sb_row_index][sb_column_index] + ->partition_tree_order(); + while (!partition_tree_order.Empty()) { + PartitionTreeNode block = partition_tree_order.Front(); + if (!DecodeBlock(block.row4x4, block.column4x4, block.block_size, + scratch_buffer, &residual_buffer)) { 
LIBGAV1_DLOG(ERROR, "Error decoding block row: %d column: %d", - node->row4x4(), node->column4x4()); + block.row4x4, block.column4x4); return false; } - } while (!stack.Empty()); + partition_tree_order.Pop(); + } return true; } diff --git a/src/utils/array_2d.h b/src/utils/array_2d.h index 2df6241..df2da9f 100644 --- a/src/utils/array_2d.h +++ b/src/utils/array_2d.h @@ -120,7 +120,7 @@ class Array2D { const T* operator[](int row) const { return data_view_[row]; } private: - std::unique_ptr<T[]> data_ = nullptr; + std::unique_ptr<T[]> data_; size_t allocated_size_ = 0; size_t size_ = 0; Array2DView<T> data_view_; diff --git a/src/utils/block_parameters_holder.cc b/src/utils/block_parameters_holder.cc index 3ccdb9b..3bb9f1e 100644 --- a/src/utils/block_parameters_holder.cc +++ b/src/utils/block_parameters_holder.cc @@ -19,53 +19,29 @@ #include "src/utils/common.h" #include "src/utils/constants.h" #include "src/utils/logging.h" -#include "src/utils/parameter_tree.h" #include "src/utils/types.h" namespace libgav1 { -namespace { - -// Returns the number of super block rows/columns for |value4x4| where value4x4 -// is either rows4x4 or columns4x4. -int RowsOrColumns4x4ToSuperBlocks(int value4x4, bool use_128x128_superblock) { - return use_128x128_superblock ? 
DivideBy128(MultiplyBy4(value4x4) + 127) - : DivideBy64(MultiplyBy4(value4x4) + 63); -} - -} // namespace - -bool BlockParametersHolder::Reset(int rows4x4, int columns4x4, - bool use_128x128_superblock) { +bool BlockParametersHolder::Reset(int rows4x4, int columns4x4) { rows4x4_ = rows4x4; columns4x4_ = columns4x4; - use_128x128_superblock_ = use_128x128_superblock; - if (!block_parameters_cache_.Reset(rows4x4_, columns4x4_)) { - LIBGAV1_DLOG(ERROR, "block_parameters_cache_.Reset() failed."); - return false; - } - const int rows = - RowsOrColumns4x4ToSuperBlocks(rows4x4_, use_128x128_superblock_); - const int columns = - RowsOrColumns4x4ToSuperBlocks(columns4x4_, use_128x128_superblock_); - const BlockSize sb_size = - use_128x128_superblock_ ? kBlock128x128 : kBlock64x64; - const int multiplier = kNum4x4BlocksWide[sb_size]; - if (!trees_.Reset(rows, columns, /*zero_initialize=*/false)) { - LIBGAV1_DLOG(ERROR, "trees_.Reset() failed."); - return false; - } - for (int i = 0; i < rows; ++i) { - for (int j = 0; j < columns; ++j) { - trees_[i][j] = - ParameterTree::Create(i * multiplier, j * multiplier, sb_size); - if (trees_[i][j] == nullptr) { - LIBGAV1_DLOG(ERROR, "Allocation of trees_[%d][%d] failed.", i, j); - return false; - } - } + index_ = 0; + return block_parameters_cache_.Reset(rows4x4_, columns4x4_) && + block_parameters_.Resize(rows4x4_ * columns4x4_); +} + +BlockParameters* BlockParametersHolder::Get(int row4x4, int column4x4, + BlockSize block_size) { + const size_t index = index_.fetch_add(1, std::memory_order_relaxed); + if (index >= block_parameters_.size()) return nullptr; + auto& bp = block_parameters_.get()[index]; + if (bp == nullptr) { + bp.reset(new (std::nothrow) BlockParameters); + if (bp == nullptr) return nullptr; } - return true; + FillCache(row4x4, column4x4, block_size, bp.get()); + return bp.get(); } void BlockParametersHolder::FillCache(int row4x4, int column4x4, diff --git a/src/utils/block_parameters_holder.h 
b/src/utils/block_parameters_holder.h index 35543c3..ca36907 100644 --- a/src/utils/block_parameters_holder.h +++ b/src/utils/block_parameters_holder.h @@ -17,18 +17,18 @@ #ifndef LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_ #define LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_ +#include <atomic> #include <memory> #include "src/utils/array_2d.h" #include "src/utils/compiler_attributes.h" #include "src/utils/constants.h" -#include "src/utils/parameter_tree.h" +#include "src/utils/dynamic_buffer.h" #include "src/utils/types.h" namespace libgav1 { -// Holds a 2D array of |ParameterTree| objects. Each tree stores the parameters -// corresponding to a superblock. +// Holds the BlockParameters pointers to each 4x4 block in the frame. class BlockParametersHolder { public: BlockParametersHolder() = default; @@ -37,10 +37,13 @@ class BlockParametersHolder { BlockParametersHolder(const BlockParametersHolder&) = delete; BlockParametersHolder& operator=(const BlockParametersHolder&) = delete; - // If |use_128x128_superblock| is true, 128x128 superblocks will be used, - // otherwise 64x64 superblocks will be used. - LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4, - bool use_128x128_superblock); + LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4); + + // Returns a pointer to a BlockParameters object that can be used safely until + // the next call to Reset(). Returns nullptr on memory allocation failure. It + // also fills the cache matrix for the block starting at |row4x4|, |column4x4| + // of size |block_size| with the returned pointer. + BlockParameters* Get(int row4x4, int column4x4, BlockSize block_size); // Finds the BlockParameters corresponding to |row4x4| and |column4x4|. This // is done as a simple look up of the |block_parameters_cache_| matrix. @@ -59,20 +62,24 @@ class BlockParametersHolder { int columns4x4() const { return columns4x4_; } - // Returns the ParameterTree corresponding to superblock starting at (|row|, - // |column|). 
- ParameterTree* Tree(int row, int column) { return trees_[row][column].get(); } + private: + // Needs access to FillCache for testing Cdef. + template <int bitdepth, typename Pixel> + friend class PostFilterApplyCdefTest; - // Fills the cache matrix for the block starting at |row4x4|, |column4x4| of - // size |block_size| with the pointer |bp|. void FillCache(int row4x4, int column4x4, BlockSize block_size, BlockParameters* bp); - private: int rows4x4_ = 0; int columns4x4_ = 0; - bool use_128x128_superblock_ = false; - Array2D<std::unique_ptr<ParameterTree>> trees_; + + // Owns the memory of BlockParameters pointers for the entire frame. It can + // hold upto |rows4x4_| * |columns4x4_| objects. Each object will be allocated + // on demand and re-used across frames. + DynamicBuffer<std::unique_ptr<BlockParameters>> block_parameters_; + + // Points to the next available index of |block_parameters_|. + std::atomic<int> index_; // This is a 2d array of size |rows4x4_| * |columns4x4_|. This is filled in by // FillCache() and used by Find() to perform look ups using exactly one look diff --git a/src/utils/common.h b/src/utils/common.h index ae43c2b..2e599f0 100644 --- a/src/utils/common.h +++ b/src/utils/common.h @@ -30,7 +30,6 @@ #include <cassert> #include <cstddef> #include <cstdint> -#include <cstdlib> #include <cstring> #include <type_traits> @@ -131,7 +130,7 @@ inline int CountLeadingZeros(uint64_t n) { #if defined(HAVE_BITSCANREVERSE64) const unsigned char bit_set = _BitScanReverse64(&first_set_bit, static_cast<unsigned __int64>(n)); -#else // !defined(HAVE_BITSCANREVERSE64) +#else // !defined(HAVE_BITSCANREVERSE64) const auto n_hi = static_cast<unsigned long>(n >> 32); // NOLINT(runtime/int) if (n_hi != 0) { const unsigned char bit_set = _BitScanReverse(&first_set_bit, n_hi); @@ -376,7 +375,7 @@ constexpr bool IsDirectionalMode(PredictionMode mode) { // behavior and result apply to other CPUs' SIMD instructions. 
inline int GetRelativeDistance(const unsigned int a, const unsigned int b, const unsigned int order_hint_shift_bits) { - const int diff = a - b; + const int diff = static_cast<int>(a) - static_cast<int>(b); assert(order_hint_shift_bits <= 31); if (order_hint_shift_bits == 0) { assert(a == 0); diff --git a/src/utils/constants.h b/src/utils/constants.h index 34cf56d..a2076c5 100644 --- a/src/utils/constants.h +++ b/src/utils/constants.h @@ -629,6 +629,52 @@ inline const char* ToString(const LoopRestorationType type) { abort(); } +inline const char* ToString(const TransformSize size) { + switch (size) { + case kTransformSize4x4: + return "kTransformSize4x4"; + case kTransformSize4x8: + return "kTransformSize4x8"; + case kTransformSize4x16: + return "kTransformSize4x16"; + case kTransformSize8x4: + return "kTransformSize8x4"; + case kTransformSize8x8: + return "kTransformSize8x8"; + case kTransformSize8x16: + return "kTransformSize8x16"; + case kTransformSize8x32: + return "kTransformSize8x32"; + case kTransformSize16x4: + return "kTransformSize16x4"; + case kTransformSize16x8: + return "kTransformSize16x8"; + case kTransformSize16x16: + return "kTransformSize16x16"; + case kTransformSize16x32: + return "kTransformSize16x32"; + case kTransformSize16x64: + return "kTransformSize16x64"; + case kTransformSize32x8: + return "kTransformSize32x8"; + case kTransformSize32x16: + return "kTransformSize32x16"; + case kTransformSize32x32: + return "kTransformSize32x32"; + case kTransformSize32x64: + return "kTransformSize32x64"; + case kTransformSize64x16: + return "kTransformSize64x16"; + case kTransformSize64x32: + return "kTransformSize64x32"; + case kTransformSize64x64: + return "kTransformSize64x64"; + case kNumTransformSizes: + return "kNumTransformSizes"; + } + abort(); +} + inline const char* ToString(const TransformType type) { switch (type) { case kTransformTypeDctDct: diff --git a/src/utils/cpu.cc b/src/utils/cpu.cc index a6b7057..b3c51da 100644 --- a/src/utils/cpu.cc 
+++ b/src/utils/cpu.cc @@ -39,7 +39,7 @@ uint64_t Xgetbv() { __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(ecx)); return (static_cast<uint64_t>(edx) << 32) | eax; } -#else // _MSC_VER +#else // _MSC_VER void CpuId(int leaf, uint32_t info[4]) { __cpuidex(reinterpret_cast<int*>(info), leaf, 0 /*ecx=subleaf*/); } diff --git a/src/utils/cpu.h b/src/utils/cpu.h index 630b251..aefc2df 100644 --- a/src/utils/cpu.h +++ b/src/utils/cpu.h @@ -38,7 +38,7 @@ namespace libgav1 { #if !defined(LIBGAV1_ENABLE_AVX2) #define LIBGAV1_ENABLE_AVX2 1 #endif // !defined(LIBGAV1_ENABLE_AVX2) -#else // !LIBGAV1_ENABLE_SSE4_1 +#else // !LIBGAV1_ENABLE_SSE4_1 // Disable AVX2 when SSE4.1 is disabled as it may rely on shared components. #undef LIBGAV1_ENABLE_AVX2 #define LIBGAV1_ENABLE_AVX2 0 diff --git a/src/utils/dynamic_buffer.h b/src/utils/dynamic_buffer.h index b51345a..40ece26 100644 --- a/src/utils/dynamic_buffer.h +++ b/src/utils/dynamic_buffer.h @@ -46,6 +46,8 @@ class DynamicBuffer { return true; } + size_t size() const { return size_; } + private: std::unique_ptr<T[]> buffer_; size_t size_ = 0; diff --git a/src/utils/libgav1_utils.cmake b/src/utils/libgav1_utils.cmake index 8b6ec4b..587ca5d 100644 --- a/src/utils/libgav1_utils.cmake +++ b/src/utils/libgav1_utils.cmake @@ -39,8 +39,6 @@ list(APPEND libgav1_utils_sources "${libgav1_source}/utils/logging.cc" "${libgav1_source}/utils/logging.h" "${libgav1_source}/utils/memory.h" - "${libgav1_source}/utils/parameter_tree.cc" - "${libgav1_source}/utils/parameter_tree.h" "${libgav1_source}/utils/queue.h" "${libgav1_source}/utils/raw_bit_reader.cc" "${libgav1_source}/utils/raw_bit_reader.h" diff --git a/src/utils/logging.cc b/src/utils/logging.cc index 9a43c22..26e3e15 100644 --- a/src/utils/logging.cc +++ b/src/utils/logging.cc @@ -56,7 +56,7 @@ void Log(LogSeverity severity, const char* file, int line, const char* format, va_end(ap); fprintf(stderr, "\n"); } -#else // !LIBGAV1_ENABLE_LOGGING +#else // !LIBGAV1_ENABLE_LOGGING void 
Log(LogSeverity /*severity*/, const char* /*file*/, int /*line*/, const char* /*format*/, ...) {} #endif // LIBGAV1_ENABLE_LOGGING diff --git a/src/utils/logging.h b/src/utils/logging.h index 48928db..473aebd 100644 --- a/src/utils/logging.h +++ b/src/utils/logging.h @@ -35,13 +35,13 @@ // setting LIBGAV1_ENABLE_LOGGING. // Severity is given as an all-caps version of enum LogSeverity with the // leading 'k' removed: LIBGAV1_DLOG(INFO, "..."); -#define LIBGAV1_DLOG(severity, ...) \ - do { \ - constexpr const char* libgav1_logging_internal_basename = \ - ::libgav1::internal::Basename(__FILE__, sizeof(__FILE__) - 1); \ - ::libgav1::internal::Log(LIBGAV1_LOGGING_INTERNAL_##severity, \ - libgav1_logging_internal_basename, __LINE__, \ - __VA_ARGS__); \ +#define LIBGAV1_DLOG(severity, ...) \ + do { \ + constexpr const char* libgav1_logging_internal_basename = \ + libgav1::internal::Basename(__FILE__, sizeof(__FILE__) - 1); \ + libgav1::internal::Log(LIBGAV1_LOGGING_INTERNAL_##severity, \ + libgav1_logging_internal_basename, __LINE__, \ + __VA_ARGS__); \ } while (0) #else #define LIBGAV1_DLOG(severity, ...) \ @@ -49,10 +49,10 @@ } while (0) #endif // LIBGAV1_ENABLE_LOGGING -#define LIBGAV1_LOGGING_INTERNAL_ERROR ::libgav1::internal::LogSeverity::kError +#define LIBGAV1_LOGGING_INTERNAL_ERROR libgav1::internal::LogSeverity::kError #define LIBGAV1_LOGGING_INTERNAL_WARNING \ - ::libgav1::internal::LogSeverity::kWarning -#define LIBGAV1_LOGGING_INTERNAL_INFO ::libgav1::internal::LogSeverity::kInfo + libgav1::internal::LogSeverity::kWarning +#define LIBGAV1_LOGGING_INTERNAL_INFO libgav1::internal::LogSeverity::kInfo namespace libgav1 { namespace internal { diff --git a/src/utils/memory.h b/src/utils/memory.h index 219a83f..a8da53b 100644 --- a/src/utils/memory.h +++ b/src/utils/memory.h @@ -71,7 +71,7 @@ inline void* AlignedAlloc(size_t alignment, size_t size) { // more convenient to use memalign(). Unlike glibc, Android does not consider // memalign() an obsolete function. 
return memalign(alignment, size); -#else // !defined(__ANDROID__) +#else // !defined(__ANDROID__) void* ptr = nullptr; // posix_memalign requires that the requested alignment be at least // sizeof(void*). In this case, fall back on malloc which should return diff --git a/src/utils/parameter_tree.cc b/src/utils/parameter_tree.cc deleted file mode 100644 index 9426ce6..0000000 --- a/src/utils/parameter_tree.cc +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright 2019 The libgav1 Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "src/utils/parameter_tree.h" - -#include <cassert> -#include <memory> -#include <new> - -#include "src/utils/common.h" -#include "src/utils/constants.h" -#include "src/utils/logging.h" -#include "src/utils/types.h" - -namespace libgav1 { - -// static -std::unique_ptr<ParameterTree> ParameterTree::Create(int row4x4, int column4x4, - BlockSize block_size, - bool is_leaf) { - std::unique_ptr<ParameterTree> tree( - new (std::nothrow) ParameterTree(row4x4, column4x4, block_size)); - if (tree != nullptr && is_leaf && !tree->SetPartitionType(kPartitionNone)) { - tree = nullptr; - } - return tree; -} - -bool ParameterTree::SetPartitionType(Partition partition) { - assert(!partition_type_set_); - partition_ = partition; - partition_type_set_ = true; - const int block_width4x4 = kNum4x4BlocksWide[block_size_]; - const int half_block4x4 = block_width4x4 >> 1; - const int quarter_block4x4 = half_block4x4 >> 1; - const BlockSize sub_size = kSubSize[partition][block_size_]; - const BlockSize split_size = kSubSize[kPartitionSplit][block_size_]; - assert(partition == kPartitionNone || sub_size != kBlockInvalid); - switch (partition) { - case kPartitionNone: - parameters_.reset(new (std::nothrow) BlockParameters()); - return parameters_ != nullptr; - case kPartitionHorizontal: - children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true); - children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, - sub_size, true); - return children_[0] != nullptr && children_[1] != nullptr; - case kPartitionVertical: - children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true); - children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, - sub_size, true); - return children_[0] != nullptr && children_[1] != nullptr; - case kPartitionSplit: - children_[0] = - ParameterTree::Create(row4x4_, column4x4_, sub_size, false); - children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, - sub_size, false); - 
children_[2] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, - sub_size, false); - children_[3] = ParameterTree::Create( - row4x4_ + half_block4x4, column4x4_ + half_block4x4, sub_size, false); - return children_[0] != nullptr && children_[1] != nullptr && - children_[2] != nullptr && children_[3] != nullptr; - case kPartitionHorizontalWithTopSplit: - assert(split_size != kBlockInvalid); - children_[0] = - ParameterTree::Create(row4x4_, column4x4_, split_size, true); - children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, - split_size, true); - children_[2] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, - sub_size, true); - return children_[0] != nullptr && children_[1] != nullptr && - children_[2] != nullptr; - case kPartitionHorizontalWithBottomSplit: - assert(split_size != kBlockInvalid); - children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true); - children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, - split_size, true); - children_[2] = - ParameterTree::Create(row4x4_ + half_block4x4, - column4x4_ + half_block4x4, split_size, true); - return children_[0] != nullptr && children_[1] != nullptr && - children_[2] != nullptr; - case kPartitionVerticalWithLeftSplit: - assert(split_size != kBlockInvalid); - children_[0] = - ParameterTree::Create(row4x4_, column4x4_, split_size, true); - children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, - split_size, true); - children_[2] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, - sub_size, true); - return children_[0] != nullptr && children_[1] != nullptr && - children_[2] != nullptr; - case kPartitionVerticalWithRightSplit: - assert(split_size != kBlockInvalid); - children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true); - children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, - split_size, true); - children_[2] = - ParameterTree::Create(row4x4_ + half_block4x4, - 
column4x4_ + half_block4x4, split_size, true); - return children_[0] != nullptr && children_[1] != nullptr && - children_[2] != nullptr; - case kPartitionHorizontal4: - for (int i = 0; i < 4; ++i) { - children_[i] = ParameterTree::Create(row4x4_ + i * quarter_block4x4, - column4x4_, sub_size, true); - if (children_[i] == nullptr) return false; - } - return true; - default: - assert(partition == kPartitionVertical4); - for (int i = 0; i < 4; ++i) { - children_[i] = ParameterTree::Create( - row4x4_, column4x4_ + i * quarter_block4x4, sub_size, true); - if (children_[i] == nullptr) return false; - } - return true; - } -} - -} // namespace libgav1 diff --git a/src/utils/parameter_tree.h b/src/utils/parameter_tree.h deleted file mode 100644 index 935f3eb..0000000 --- a/src/utils/parameter_tree.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright 2019 The libgav1 Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_ -#define LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_ - -#include <cassert> -#include <memory> - -#include "src/utils/common.h" -#include "src/utils/compiler_attributes.h" -#include "src/utils/constants.h" -#include "src/utils/memory.h" -#include "src/utils/types.h" - -namespace libgav1 { - -class ParameterTree : public Allocable { - public: - // Creates a parameter tree to store the parameters of a block of size - // |block_size| starting at coordinates |row4x4| and |column4x4|. 
If |is_leaf| - // is set to true, the memory will be allocated for the BlockParameters for - // this node. Otherwise, no memory will be allocated. If |is_leaf| is set to - // false, |block_size| must be a square block, i.e., - // kBlockWidthPixels[block_size] must be equal to - // kBlockHeightPixels[block_size]. - static std::unique_ptr<ParameterTree> Create(int row4x4, int column4x4, - BlockSize block_size, - bool is_leaf = false); - - // Move only (not Copyable). - ParameterTree(ParameterTree&& other) = default; - ParameterTree& operator=(ParameterTree&& other) = default; - ParameterTree(const ParameterTree&) = delete; - ParameterTree& operator=(const ParameterTree&) = delete; - - // Set the partition type of the current node to |partition|. - // if (partition == kPartitionNone) { - // Memory will be allocated for the BlockParameters for this node. - // } else if (partition != kPartitionSplit) { - // The appropriate child nodes will be populated and memory will be - // allocated for the BlockParameters of the children. - // } else { - // The appropriate child nodes will be populated but they are considered to - // be hanging, i.e., future calls to SetPartitionType() on the child nodes - // will have to set them or their descendants to a terminal type. - // } - // This function must be called only once per node. - LIBGAV1_MUST_USE_RESULT bool SetPartitionType(Partition partition); - - // Basic getters. - int row4x4() const { return row4x4_; } - int column4x4() const { return column4x4_; } - BlockSize block_size() const { return block_size_; } - Partition partition() const { return partition_; } - ParameterTree* children(int index) const { - assert(index < 4); - return children_[index].get(); - } - // Returns the BlockParameters object of the current node if one exists. - // Otherwise returns nullptr. This function will return a valid - // BlockParameters object only for leaf nodes. 
- BlockParameters* parameters() const { return parameters_.get(); } - - private: - ParameterTree(int row4x4, int column4x4, BlockSize block_size) - : row4x4_(row4x4), column4x4_(column4x4), block_size_(block_size) {} - - Partition partition_ = kPartitionNone; - std::unique_ptr<BlockParameters> parameters_ = nullptr; - int row4x4_ = -1; - int column4x4_ = -1; - BlockSize block_size_ = kBlockInvalid; - bool partition_type_set_ = false; - - // Child values are defined as follows for various partition types: - // * Horizontal: 0 top partition; 1 bottom partition; 2 nullptr; 3 nullptr; - // * Vertical: 0 left partition; 1 right partition; 2 nullptr; 3 nullptr; - // * Split: 0 top-left partition; 1 top-right partition; 2; bottom-left - // partition; 3 bottom-right partition; - // * HorizontalWithTopSplit: 0 top-left partition; 1 top-right partition; 2 - // bottom partition; 3 nullptr; - // * HorizontalWithBottomSplit: 0 top partition; 1 bottom-left partition; 2 - // bottom-right partition; 3 nullptr; - // * VerticalWithLeftSplit: 0 top-left partition; 1 bottom-left partition; 2 - // right partition; 3 nullptr; - // * VerticalWithRightSplit: 0 left-partition; 1 top-right partition; 2 - // bottom-right partition; 3 nullptr; - // * Horizontal4: 0 top partition; 1 second top partition; 2 third top - // partition; 3 bottom partition; - // * Vertical4: 0 left partition; 1 second left partition; 2 third left - // partition; 3 right partition; - std::unique_ptr<ParameterTree> children_[4] = {}; - - friend class ParameterTreeTest; -}; - -} // namespace libgav1 - -#endif // LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_ diff --git a/src/utils/raw_bit_reader.h b/src/utils/raw_bit_reader.h index 76e7bfa..7d8ce8f 100644 --- a/src/utils/raw_bit_reader.h +++ b/src/utils/raw_bit_reader.h @@ -38,7 +38,7 @@ class RawBitReader : public BitReader, public Allocable { size_t* value); // le(n) in the spec. bool ReadUnsignedLeb128(size_t* value); // leb128() in the spec. 
// Reads a variable length unsigned number and stores it in |*value|. On a - // successful return, |*value| is in the range of 0 to UINT32_MAX − 1, + // successful return, |*value| is in the range of 0 to UINT32_MAX - 1, // inclusive. bool ReadUvlc(uint32_t* value); // uvlc() in the spec. bool Finished() const; diff --git a/src/utils/threadpool.cc b/src/utils/threadpool.cc index 8c8f4fe..a3099e1 100644 --- a/src/utils/threadpool.cc +++ b/src/utils/threadpool.cc @@ -37,17 +37,21 @@ #include <chrono> // NOLINT (unapproved c++11 header) #endif +// Define the GetTid() function, a wrapper for the gettid() system call in +// Linux. +#if defined(__ANDROID__) +static pid_t GetTid() { return gettid(); } +#elif defined(__GLIBC__) // The glibc wrapper for the gettid() system call was added in glibc 2.30. // Emulate it for older versions of glibc. -#if defined(__GLIBC_PREREQ) -#if !__GLIBC_PREREQ(2, 30) - +#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 30) +static pid_t GetTid() { return gettid(); } +#else // Older than glibc 2.30 #include <sys/syscall.h> -static pid_t gettid() { return static_cast<pid_t>(syscall(SYS_gettid)); } - -#endif -#endif // defined(__GLIBC_PREREQ) +static pid_t GetTid() { return static_cast<pid_t>(syscall(SYS_gettid)); } +#endif // glibc 2.30 or later. +#endif // defined(__GLIBC__) namespace libgav1 { @@ -216,7 +220,7 @@ void ThreadPool::WorkerThread::SetupName() { // If the |name| buffer is longer than 16 bytes, pthread_setname_np fails // with error 34 (ERANGE) on Android. 
char name[16]; - pid_t id = gettid(); + pid_t id = GetTid(); int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_, static_cast<int64_t>(id)); assert(rv >= 0); diff --git a/src/utils/types.h b/src/utils/types.h index 374f06b..eba13b7 100644 --- a/src/utils/types.h +++ b/src/utils/types.h @@ -18,6 +18,7 @@ #define LIBGAV1_SRC_UTILS_TYPES_H_ #include <array> +#include <cstddef> #include <cstdint> #include <memory> @@ -512,6 +513,10 @@ struct ObuFrameHeader { Delta delta_lf; // A valid value of reference_frame_index[i] is in the range [0, 7]. -1 // indicates an invalid value. + // + // NOTE: When the frame is an intra frame (frame_type is kFrameKey or + // kFrameIntraOnly), reference_frame_index is not used and may be + // uninitialized. int8_t reference_frame_index[kNumInterReferenceFrameTypes]; // The ref_order_hint[ i ] syntax element in the uncompressed header. // Specifies the expected output order hint for each reference frame. @@ -521,5 +526,24 @@ struct ObuFrameHeader { FilmGrainParams film_grain_params; }; +// Structure used for traversing the partition tree. +struct PartitionTreeNode { + PartitionTreeNode() = default; + PartitionTreeNode(int row4x4, int column4x4, BlockSize block_size) + : row4x4(row4x4), column4x4(column4x4), block_size(block_size) {} + int row4x4 = -1; + int column4x4 = -1; + BlockSize block_size = kBlockInvalid; +}; + +// Structure used for storing the transform parameters in a superblock. 
+struct TransformParameters { + TransformParameters() = default; + TransformParameters(TransformType type, int non_zero_coeff_count) + : type(type), non_zero_coeff_count(non_zero_coeff_count) {} + TransformType type; + int non_zero_coeff_count; +}; + } // namespace libgav1 #endif // LIBGAV1_SRC_UTILS_TYPES_H_ diff --git a/tests/block_utils.cc b/tests/block_utils.cc new file mode 100644 index 0000000..96833a2 --- /dev/null +++ b/tests/block_utils.cc @@ -0,0 +1,130 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "tests/block_utils.h" + +#include <algorithm> +#include <cstdint> +#include <cstdio> +#include <cstring> + +namespace libgav1 { +namespace test_utils { +namespace { + +template <typename Pixel> +void PrintBlockDiff(const Pixel* block1, const Pixel* block2, int width, + int height, int stride1, int stride2, + const bool print_padding) { + const int print_width = print_padding ? std::min(stride1, stride2) : width; + const int field_width = (sizeof(Pixel) == 1) ? 
4 : 5; + + for (int y = 0; y < height; ++y) { + printf("[%2d] ", y); + for (int x = 0; x < print_width; ++x) { + if (x >= width) { + if (block1[x] == block2[x]) { + printf("[%*d] ", field_width, block1[x]); + } else { + printf("[*%*d] ", field_width - 1, block1[x]); + } + } else { + if (block1[x] == block2[x]) { + printf("%*d ", field_width, block1[x]); + } else { + printf("*%*d ", field_width - 1, block1[x]); + } + } + } + printf("\n"); + block1 += stride1; + block2 += stride2; + } +} + +} // namespace + +template <typename Pixel> +void PrintBlock(const Pixel* block, int width, int height, int stride, + const bool print_padding /*= false*/) { + const int print_width = print_padding ? stride : width; + const int field_width = (sizeof(Pixel) == 1) ? 4 : 5; + for (int y = 0; y < height; ++y) { + printf("[%2d] ", y); + for (int x = 0; x < print_width; ++x) { + if (x >= width) { + printf("[%*d] ", field_width, block[x]); + } else { + printf("%*d ", field_width, block[x]); + } + } + printf("\n"); + block += stride; + } +} + +template void PrintBlock(const uint8_t* block, int width, int height, + int stride, bool print_padding /*= false*/); +template void PrintBlock(const uint16_t* block, int width, int height, + int stride, bool print_padding /*= false*/); +template void PrintBlock(const int8_t* block, int width, int height, int stride, + bool print_padding /*= false*/); +template void PrintBlock(const int16_t* block, int width, int height, + int stride, bool print_padding /*= false*/); + +template <typename Pixel> +bool CompareBlocks(const Pixel* block1, const Pixel* block2, int width, + int height, int stride1, int stride2, + const bool check_padding, const bool print_diff /*= true*/) { + bool ok = true; + const int check_width = check_padding ? 
std::min(stride1, stride2) : width; + for (int y = 0; y < height; ++y) { + const uint64_t row1 = static_cast<uint64_t>(y) * stride1; + const uint64_t row2 = static_cast<uint64_t>(y) * stride2; + ok = memcmp(block1 + row1, block2 + row2, + sizeof(block1[0]) * check_width) == 0; + if (!ok) break; + } + if (!ok && print_diff) { + printf("block1 (width: %d height: %d stride: %d):\n", width, height, + stride1); + PrintBlockDiff(block1, block2, width, height, stride1, stride2, + check_padding); + printf("\nblock2 (width: %d height: %d stride: %d):\n", width, height, + stride2); + PrintBlockDiff(block2, block1, width, height, stride2, stride1, + check_padding); + } + return ok; +} + +template bool CompareBlocks(const uint8_t* block1, const uint8_t* block2, + int width, int height, int stride1, int stride2, + const bool check_padding, + const bool print_diff /*= true*/); +template bool CompareBlocks(const uint16_t* block1, const uint16_t* block2, + int width, int height, int stride1, int stride2, + const bool check_padding, + const bool print_diff /*= true*/); +template bool CompareBlocks(const int8_t* block1, const int8_t* block2, + int width, int height, int stride1, int stride2, + const bool check_padding, + const bool print_diff /*= true*/); +template bool CompareBlocks(const int16_t* block1, const int16_t* block2, + int width, int height, int stride1, int stride2, + const bool check_padding, + const bool print_diff /*= true*/); + +} // namespace test_utils +} // namespace libgav1 diff --git a/tests/block_utils.h b/tests/block_utils.h new file mode 100644 index 0000000..4542420 --- /dev/null +++ b/tests/block_utils.h @@ -0,0 +1,62 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_TESTS_BLOCK_UTILS_H_ +#define LIBGAV1_TESTS_BLOCK_UTILS_H_ + +#include <cstdint> + +namespace libgav1 { +namespace test_utils { + +//------------------------------------------------------------------------------ +// Prints |block| pixel by pixel with |width| pixels per row if |print_padding| +// is false, |stride| otherwise. If |print_padding| is true padding pixels are +// surrounded in '[]'. +template <typename Pixel> +void PrintBlock(const Pixel* block, int width, int height, int stride, + bool print_padding = false); + +extern template void PrintBlock(const uint8_t* block, int width, int height, + int stride, bool print_padding /*= false*/); +extern template void PrintBlock(const uint16_t* block, int width, int height, + int stride, bool print_padding /*= false*/); + +//------------------------------------------------------------------------------ +// Compares |block1| and |block2| pixel by pixel checking |width| pixels per row +// if |check_padding| is false, min(|stride1|, |stride2|) pixels otherwise. +// Prints the blocks with differences marked with a '*' if |print_diff| is +// true (the default). 
+ +template <typename Pixel> +bool CompareBlocks(const Pixel* block1, const Pixel* block2, int width, + int height, int stride1, int stride2, bool check_padding, + bool print_diff = true); + +extern template bool CompareBlocks(const uint8_t* block1, const uint8_t* block2, + int width, int height, int stride1, + int stride2, bool check_padding, + bool print_diff /*= true*/); +extern template bool CompareBlocks(const uint16_t* block1, + const uint16_t* block2, int width, + int height, int stride1, int stride2, + bool check_padding, + bool print_diff /*= true*/); + +} // namespace test_utils +} // namespace libgav1 + +#endif // LIBGAV1_TESTS_BLOCK_UTILS_H_ diff --git a/tests/libgav1_tests.cmake b/tests/libgav1_tests.cmake new file mode 100644 index 0000000..ac2fb2e --- /dev/null +++ b/tests/libgav1_tests.cmake @@ -0,0 +1,626 @@ +# Copyright 2020 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if(LIBGAV1_LIBGAV1_TESTS_CMAKE_) + return() +endif() # LIBGAV1_LIBGAV1_TESTS_CMAKE_ +set(LIBGAV1_LIBGAV1_TESTS_CMAKE_ 1) + +set(libgav1_googletest "${libgav1_root}/third_party/googletest") +if(NOT LIBGAV1_ENABLE_TESTS OR NOT EXISTS "${libgav1_googletest}") + macro(libgav1_add_tests_targets) + + endmacro() + + if(LIBGAV1_ENABLE_TESTS AND NOT EXISTS "${libgav1_googletest}") + message( + "GoogleTest not found, setting LIBGAV1_ENABLE_TESTS to false.\n" + "To enable tests download the GoogleTest repository to" + " third_party/googletest:\n\n git \\\n -C ${libgav1_root} \\\n" + " clone \\\n" + " https://github.com/google/googletest.git third_party/googletest\n") + set(LIBGAV1_ENABLE_TESTS FALSE CACHE BOOL "Enables tests." FORCE) + endif() + return() +endif() + +# Check GoogleTest compiler requirements. +if((CMAKE_CXX_COMPILER_ID + MATCHES + "Clang|GNU" + AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5") + OR (MSVC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "19")) + macro(libgav1_add_tests_targets) + + endmacro() + + message( + WARNING + "${CMAKE_CXX_COMPILER} (${CMAKE_CXX_COMPILER_ID} version" + " ${CMAKE_CXX_COMPILER_VERSION}) is below the minimum requirements for" + " GoogleTest; disabling unit tests. See" + " https://github.com/google/googletest#compilers for more detail.") + set(LIBGAV1_ENABLE_TESTS FALSE CACHE BOOL "Enables tests." 
FORCE) + return() +endif() + +list(APPEND libgav1_tests_block_utils_sources + "${libgav1_root}/tests/block_utils.h" + "${libgav1_root}/tests/block_utils.cc") + +list(APPEND libgav1_tests_utils_sources + "${libgav1_root}/tests/third_party/libvpx/acm_random.h" + "${libgav1_root}/tests/third_party/libvpx/md5_helper.h" + "${libgav1_root}/tests/third_party/libvpx/md5_utils.cc" + "${libgav1_root}/tests/third_party/libvpx/md5_utils.h" + "${libgav1_root}/tests/utils.h" "${libgav1_root}/tests/utils.cc") + +list(APPEND libgav1_tests_utils_test_sources + "${libgav1_root}/tests/utils_test.cc") + +list(APPEND libgav1_average_blend_test_sources + "${libgav1_source}/dsp/average_blend_test.cc") +list(APPEND libgav1_cdef_test_sources "${libgav1_source}/dsp/cdef_test.cc") +list(APPEND libgav1_convolve_test_sources + "${libgav1_source}/dsp/convolve_test.cc") +list(APPEND libgav1_distance_weighted_blend_test_sources + "${libgav1_source}/dsp/distance_weighted_blend_test.cc") +list(APPEND libgav1_dsp_test_sources "${libgav1_source}/dsp/dsp_test.cc") +list(APPEND libgav1_intra_edge_test_sources + "${libgav1_source}/dsp/intra_edge_test.cc") +list(APPEND libgav1_intrapred_cfl_test_sources + "${libgav1_source}/dsp/intrapred_cfl_test.cc") +list(APPEND libgav1_intrapred_directional_test_sources + "${libgav1_source}/dsp/intrapred_directional_test.cc") +list(APPEND libgav1_intrapred_filter_test_sources + "${libgav1_source}/dsp/intrapred_filter_test.cc") +list(APPEND libgav1_intrapred_test_sources + "${libgav1_source}/dsp/intrapred_test.cc") +list(APPEND libgav1_inverse_transform_test_sources + "${libgav1_source}/dsp/inverse_transform_test.cc") +list(APPEND libgav1_loop_filter_test_sources + "${libgav1_source}/dsp/loop_filter_test.cc") +list(APPEND libgav1_loop_restoration_test_sources + "${libgav1_source}/dsp/loop_restoration_test.cc") +list(APPEND libgav1_mask_blend_test_sources + "${libgav1_source}/dsp/mask_blend_test.cc") +list(APPEND libgav1_motion_field_projection_test_sources + 
"${libgav1_source}/dsp/motion_field_projection_test.cc") +list(APPEND libgav1_motion_vector_search_test_sources + "${libgav1_source}/dsp/motion_vector_search_test.cc") +list(APPEND libgav1_super_res_test_sources + "${libgav1_source}/dsp/super_res_test.cc") +list(APPEND libgav1_weight_mask_test_sources + "${libgav1_source}/dsp/weight_mask_test.cc") +list(APPEND libgav1_obmc_test_sources "${libgav1_source}/dsp/obmc_test.cc") +list(APPEND libgav1_warp_test_sources "${libgav1_source}/dsp/warp_test.cc") + +macro(libgav1_add_tests_targets) + if(NOT LIBGAV1_ENABLE_TESTS) + message( + FATAL_ERROR + "This version of libgav1_add_tests_targets() should only be used with" + " LIBGAV1_ENABLE_TESTS set to true.") + endif() + libgav1_add_library(TEST + NAME + libgav1_gtest + TYPE + STATIC + SOURCES + "${libgav1_googletest}/googletest/src/gtest-all.cc" + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_gtest_include_paths} + ${libgav1_include_paths}) + + libgav1_add_library(TEST + NAME + libgav1_gtest_main + TYPE + STATIC + SOURCES + "${libgav1_googletest}/googletest/src/gtest_main.cc" + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_gtest_include_paths} + ${libgav1_include_paths}) + + if(ANDROID OR IOS) + if(DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX + AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX) + set(use_absl_threading TRUE) + endif() + elseif(NOT + (DEFINED + LIBGAV1_THREADPOOL_USE_STD_MUTEX + AND LIBGAV1_THREADPOOL_USE_STD_MUTEX)) + set(use_absl_threading TRUE) + endif() + + if(use_absl_threading) + list(APPEND libgav1_common_test_absl_deps absl::synchronization) + endif() + + libgav1_add_executable(TEST + NAME + tests_utils_test + SOURCES + ${libgav1_tests_utils_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_library(TEST + NAME + libgav1_tests_block_utils 
+ TYPE + OBJECT + SOURCES + ${libgav1_tests_block_utils_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths}) + + libgav1_add_library(TEST + NAME + libgav1_tests_utils + TYPE + OBJECT + SOURCES + ${libgav1_tests_utils_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths}) + + libgav1_add_executable(TEST + NAME + average_blend_test + SOURCES + ${libgav1_average_blend_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + cdef_test + SOURCES + ${libgav1_cdef_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + convolve_test + SOURCES + ${libgav1_convolve_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + distance_weighted_blend_test + SOURCES + ${libgav1_distance_weighted_blend_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + 
dsp_test + SOURCES + ${libgav1_dsp_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + intrapred_cfl_test + SOURCES + ${libgav1_intrapred_cfl_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + intrapred_directional_test + SOURCES + ${libgav1_intrapred_directional_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + intrapred_filter_test + SOURCES + ${libgav1_intrapred_filter_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + intrapred_test + SOURCES + ${libgav1_intrapred_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + intra_edge_test + SOURCES + 
${libgav1_intra_edge_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_tests_utils + libgav1_dsp + libgav1_utils + LIB_DEPS + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + inverse_transform_test + SOURCES + ${libgav1_inverse_transform_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_dsp + libgav1_utils + LIB_DEPS + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + loop_filter_test + SOURCES + ${libgav1_loop_filter_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + loop_restoration_test + SOURCES + ${libgav1_loop_restoration_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + mask_blend_test + SOURCES + ${libgav1_mask_blend_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + motion_field_projection_test + SOURCES + 
${libgav1_motion_field_projection_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + motion_vector_search_test + SOURCES + ${libgav1_motion_vector_search_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + obmc_test + SOURCES + ${libgav1_obmc_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + super_res_test + SOURCES + ${libgav1_super_res_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + warp_test + SOURCES + ${libgav1_warp_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + weight_mask_test + SOURCES + 
${libgav1_weight_mask_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) +endmacro() diff --git a/tests/third_party/libvpx/LICENSE b/tests/third_party/libvpx/LICENSE new file mode 100644 index 0000000..83ef339 --- /dev/null +++ b/tests/third_party/libvpx/LICENSE @@ -0,0 +1,30 @@ +Copyright (c) 2010, The WebM Project authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of Google, nor the WebM Project, nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/tests/third_party/libvpx/acm_random.h b/tests/third_party/libvpx/acm_random.h new file mode 100644 index 0000000..e8cfc9c --- /dev/null +++ b/tests/third_party/libvpx/acm_random.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_ACM_RANDOM_H_ +#define LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_ACM_RANDOM_H_ + +#include <cassert> +#include <cstdint> +#include <limits> + +#include "gtest/gtest.h" + +namespace libvpx_test { + +class ACMRandom { + public: + ACMRandom() : random_(DeterministicSeed()) {} + + explicit ACMRandom(int seed) : random_(seed) {} + + void Reset(int seed) { random_.Reseed(seed); } + uint16_t Rand16(void) { + const uint32_t value = + random_.Generate(testing::internal::Random::kMaxRange); + return (value >> 15) & 0xffff; + } + + int32_t Rand20Signed(void) { + // Use 20 bits: values between 524287 and -524288. + const uint32_t value = random_.Generate(1048576); + return static_cast<int32_t>(value) - 524288; + } + + int16_t Rand16Signed(void) { + // Use 16 bits: values between 32767 and -32768. 
+ return static_cast<int16_t>(random_.Generate(65536)); + } + + int16_t Rand13Signed(void) { + // Use 13 bits: values between 4095 and -4096. + const uint32_t value = random_.Generate(8192); + return static_cast<int16_t>(value) - 4096; + } + + int16_t Rand9Signed(void) { + // Use 9 bits: values between 255 (0x0FF) and -256 (0x100). + const uint32_t value = random_.Generate(512); + return static_cast<int16_t>(value) - 256; + } + + uint8_t Rand8(void) { + const uint32_t value = + random_.Generate(testing::internal::Random::kMaxRange); + // There's a bit more entropy in the upper bits of this implementation. + return (value >> 23) & 0xff; + } + + uint8_t Rand8Extremes(void) { + // Returns a random value near 0 or near 255, to better exercise + // saturation behavior. + const uint8_t r = Rand8(); + return static_cast<uint8_t>((r < 128) ? r << 4 : r >> 4); + } + + uint32_t RandRange(const uint32_t range) { + // testing::internal::Random::Generate provides values in the range + // testing::internal::Random::kMaxRange. + assert(range <= testing::internal::Random::kMaxRange); + return random_.Generate(range); + } + + int PseudoUniform(int range) { return random_.Generate(range); } + + int operator()(int n) { return PseudoUniform(n); } + + static constexpr int DeterministicSeed(void) { return 0xbaba; } + + private: + testing::internal::Random random_; +}; + +} // namespace libvpx_test + +#endif // LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_ACM_RANDOM_H_ diff --git a/tests/third_party/libvpx/md5_helper.h b/tests/third_party/libvpx/md5_helper.h new file mode 100644 index 0000000..c97b590 --- /dev/null +++ b/tests/third_party/libvpx/md5_helper.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_HELPER_H_ +#define LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_HELPER_H_ + +#include <cstddef> +#include <cstdint> + +#include "tests/third_party/libvpx/md5_utils.h" + +namespace libvpx_test { +class MD5 { + public: + MD5() { MD5Init(&md5_); } + + void Add(const uint8_t *data, size_t size) { + MD5Update(&md5_, data, static_cast<uint32_t>(size)); + } + + const char *Get(void) { + static const char hex[16] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', + }; + uint8_t tmp[16]; + MD5Context ctx_tmp = md5_; + + MD5Final(tmp, &ctx_tmp); + for (int i = 0; i < 16; i++) { + res_[i * 2 + 0] = hex[tmp[i] >> 4]; + res_[i * 2 + 1] = hex[tmp[i] & 0xf]; + } + res_[32] = 0; + + return res_; + } + + protected: + char res_[33]; + MD5Context md5_; +}; + +} // namespace libvpx_test + +#endif // LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_HELPER_H_ diff --git a/tests/third_party/libvpx/md5_utils.cc b/tests/third_party/libvpx/md5_utils.cc new file mode 100644 index 0000000..4638e54 --- /dev/null +++ b/tests/third_party/libvpx/md5_utils.cc @@ -0,0 +1,249 @@ +/* + * This code implements the MD5 message-digest algorithm. + * The algorithm is due to Ron Rivest. This code was + * written by Colin Plumb in 1993, no copyright is claimed. + * This code is in the public domain; do with it what you wish. + * + * Equivalent code is available from RSA Data Security, Inc. + * This code has been tested against that, and is equivalent, + * except that you don't need to include two pages of legalese + * with every copy. + * + * To compute the message digest of a chunk of bytes, declare an + * MD5Context structure, pass it to MD5Init, call MD5Update as + * needed on buffers full of bytes, and then call MD5Final, which + * will fill a supplied 16-byte array with the digest. 
+ * + * Changed so as no longer to depend on Colin Plumb's `usual.h' header + * definitions + * - Ian Jackson <ian@chiark.greenend.org.uk>. + * Still in the public domain. + */ + +#include "tests/third_party/libvpx/md5_utils.h" + +#include <cstring> + +static void byteSwap(UWORD32 *buf, unsigned words) { + md5byte *p; + + /* Only swap bytes for big endian machines */ + int i = 1; + + if (*(char *)&i == 1) return; + + p = (md5byte *)buf; + + do { + *buf++ = (UWORD32)((unsigned)p[3] << 8 | p[2]) << 16 | + ((unsigned)p[1] << 8 | p[0]); + p += 4; + } while (--words); +} + +/* + * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious + * initialization constants. + */ +void MD5Init(struct MD5Context *ctx) { + ctx->buf[0] = 0x67452301; + ctx->buf[1] = 0xefcdab89; + ctx->buf[2] = 0x98badcfe; + ctx->buf[3] = 0x10325476; + + ctx->bytes[0] = 0; + ctx->bytes[1] = 0; +} + +/* + * Update context to reflect the concatenation of another buffer full + * of bytes. + */ +void MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) { + UWORD32 t; + + /* Update byte count */ + + t = ctx->bytes[0]; + + if ((ctx->bytes[0] = t + len) < t) + ctx->bytes[1]++; /* Carry from low to high */ + + t = 64 - (t & 0x3f); /* Space available in ctx->in (at least 1) */ + + if (t > len) { + memcpy((md5byte *)ctx->in + 64 - t, buf, len); + return; + } + + /* First chunk is an odd size */ + memcpy((md5byte *)ctx->in + 64 - t, buf, t); + byteSwap(ctx->in, 16); + MD5Transform(ctx->buf, ctx->in); + buf += t; + len -= t; + + /* Process data in 64-byte chunks */ + while (len >= 64) { + memcpy(ctx->in, buf, 64); + byteSwap(ctx->in, 16); + MD5Transform(ctx->buf, ctx->in); + buf += 64; + len -= 64; + } + + /* Handle any remaining bytes of data. 
*/ + memcpy(ctx->in, buf, len); +} + +/* + * Final wrapup - pad to 64-byte boundary with the bit pattern + * 1 0* (64-bit count of bits processed, MSB-first) + */ +void MD5Final(md5byte digest[16], struct MD5Context *ctx) { + int count = ctx->bytes[0] & 0x3f; /* Number of bytes in ctx->in */ + md5byte *p = (md5byte *)ctx->in + count; + + /* Set the first char of padding to 0x80. There is always room. */ + *p++ = 0x80; + + /* Bytes of padding needed to make 56 bytes (-8..55) */ + count = 56 - 1 - count; + + if (count < 0) { /* Padding forces an extra block */ + memset(p, 0, count + 8); + byteSwap(ctx->in, 16); + MD5Transform(ctx->buf, ctx->in); + p = (md5byte *)ctx->in; + count = 56; + } + + memset(p, 0, count); + byteSwap(ctx->in, 14); + + /* Append length in bits and transform */ + ctx->in[14] = ctx->bytes[0] << 3; + ctx->in[15] = ctx->bytes[1] << 3 | ctx->bytes[0] >> 29; + MD5Transform(ctx->buf, ctx->in); + + byteSwap(ctx->buf, 4); + memcpy(digest, ctx->buf, 16); + memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */ +} + +#ifndef ASM_MD5 + +/* The four core functions - F1 is optimized somewhat */ + +/* #define F1(x, y, z) (x & y | ~x & z) */ +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +/* This is the central step in the MD5 algorithm. */ +#define MD5STEP(f, w, x, y, z, in, s) \ + (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x) + +#if defined(__clang__) && defined(__has_attribute) +#if __has_attribute(no_sanitize) +#define VPX_NO_UNSIGNED_OVERFLOW_CHECK \ + __attribute__((no_sanitize("unsigned-integer-overflow"))) +#endif +#endif + +#ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK +#define VPX_NO_UNSIGNED_OVERFLOW_CHECK +#endif + +/* + * The core of the MD5 algorithm, this alters an existing MD5 hash to + * reflect the addition of 16 longwords of new data. MD5Update blocks + * the data and converts bytes into longwords for this routine. 
+ */ +VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4], + UWORD32 const in[16]) { + UWORD32 a, b, c, d; + + a = buf[0]; + b = buf[1]; + c = buf[2]; + d = buf[3]; + + MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); + MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); + MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); + MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); + MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); + MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); + MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); + MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); + + MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); + MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); + MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); + MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); + MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); + + MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); + MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, 
in[14] + 0xfde5380c, 23); + MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); + MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); + MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); + + MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); + MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); + MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); + MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); + MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); + MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); + + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; +} + +#undef VPX_NO_UNSIGNED_OVERFLOW_CHECK + +#endif diff --git a/tests/third_party/libvpx/md5_utils.h b/tests/third_party/libvpx/md5_utils.h new file mode 100644 index 0000000..13be035 --- /dev/null +++ b/tests/third_party/libvpx/md5_utils.h @@ -0,0 +1,41 @@ +/* + * This is the header file for the MD5 message-digest algorithm. + * The algorithm is due to Ron Rivest. This code was + * written by Colin Plumb in 1993, no copyright is claimed. 
+ * This code is in the public domain; do with it what you wish. + * + * Equivalent code is available from RSA Data Security, Inc. + * This code has been tested against that, and is equivalent, + * except that you don't need to include two pages of legalese + * with every copy. + * + * To compute the message digest of a chunk of bytes, declare an + * MD5Context structure, pass it to MD5Init, call MD5Update as + * needed on buffers full of bytes, and then call MD5Final, which + * will fill a supplied 16-byte array with the digest. + * + * Changed so as no longer to depend on Colin Plumb's `usual.h' + * header definitions + * - Ian Jackson <ian@chiark.greenend.org.uk>. + * Still in the public domain. + */ + +#ifndef LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_UTILS_H_ +#define LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_UTILS_H_ + +#define md5byte unsigned char +#define UWORD32 unsigned int + +typedef struct MD5Context MD5Context; +struct MD5Context { + UWORD32 buf[4]; + UWORD32 bytes[2]; + UWORD32 in[16]; +}; + +void MD5Init(struct MD5Context *context); +void MD5Update(struct MD5Context *context, md5byte const *buf, unsigned len); +void MD5Final(unsigned char digest[16], struct MD5Context *context); +void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]); + +#endif // LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_UTILS_H_ diff --git a/tests/utils.cc b/tests/utils.cc new file mode 100644 index 0000000..b73cf01 --- /dev/null +++ b/tests/utils.cc @@ -0,0 +1,120 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "tests/utils.h" + +#include <cstddef> +#include <cstdint> +#include <cstdio> +#include <cstring> +#include <memory> +#include <string> + +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/gav1/decoder_buffer.h" +#include "src/utils/constants.h" +#include "tests/third_party/libvpx/md5_helper.h" + +namespace libgav1 { +namespace test_utils { + +void ResetDspTable(const int bitdepth) { + dsp::Dsp* const dsp = dsp_internal::GetWritableDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + memset(dsp, 0, sizeof(dsp::Dsp)); +} + +std::string GetMd5Sum(const void* bytes, size_t size) { + libvpx_test::MD5 md5; + md5.Add(static_cast<const uint8_t*>(bytes), size); + return md5.Get(); +} + +template <typename Pixel> +std::string GetMd5Sum(const Pixel* block, int width, int height, int stride) { + libvpx_test::MD5 md5; + const Pixel* row = block; + for (int i = 0; i < height; ++i) { + md5.Add(reinterpret_cast<const uint8_t*>(row), width * sizeof(Pixel)); + row += stride; + } + return md5.Get(); +} + +template std::string GetMd5Sum(const int8_t* block, int width, int height, + int stride); +template std::string GetMd5Sum(const int16_t* block, int width, int height, + int stride); + +std::string GetMd5Sum(const DecoderBuffer& buffer) { + libvpx_test::MD5 md5; + const size_t pixel_size = + (buffer.bitdepth == 8) ? 
sizeof(uint8_t) : sizeof(uint16_t); + for (int plane = kPlaneY; plane < buffer.NumPlanes(); ++plane) { + const int height = buffer.displayed_height[plane]; + const size_t width = buffer.displayed_width[plane] * pixel_size; + const int stride = buffer.stride[plane]; + const uint8_t* plane_buffer = buffer.plane[plane]; + for (int row = 0; row < height; ++row) { + md5.Add(plane_buffer, width); + plane_buffer += stride; + } + } + return md5.Get(); +} + +void CheckMd5Digest(const char name[], const char function_name[], + const char expected_digest[], const void* data, size_t size, + absl::Duration elapsed_time) { + const std::string digest = test_utils::GetMd5Sum(data, size); + printf("Mode %s[%31s]: %5d us MD5: %s\n", name, function_name, + static_cast<int>(absl::ToInt64Microseconds(elapsed_time)), + digest.c_str()); + EXPECT_STREQ(expected_digest, digest.c_str()); +} + +template <typename Pixel> +void CheckMd5Digest(const char name[], const char function_name[], + const char expected_digest[], const Pixel* block, int width, + int height, int stride, absl::Duration elapsed_time) { + const std::string digest = + test_utils::GetMd5Sum(block, width, height, stride); + printf("Mode %s[%31s]: %5d us MD5: %s\n", name, function_name, + static_cast<int>(absl::ToInt64Microseconds(elapsed_time)), + digest.c_str()); + EXPECT_STREQ(expected_digest, digest.c_str()); +} + +template void CheckMd5Digest(const char name[], const char function_name[], + const char expected_digest[], const int8_t* block, + int width, int height, int stride, + absl::Duration elapsed_time); +template void CheckMd5Digest(const char name[], const char function_name[], + const char expected_digest[], const int16_t* block, + int width, int height, int stride, + absl::Duration elapsed_time); + +void CheckMd5Digest(const char name[], const char function_name[], + const char expected_digest[], const char actual_digest[], + absl::Duration elapsed_time) { + printf("Mode %s[%31s]: %5d us MD5: %s\n", name, 
function_name, + static_cast<int>(absl::ToInt64Microseconds(elapsed_time)), + actual_digest); + EXPECT_STREQ(expected_digest, actual_digest); +} + +} // namespace test_utils +} // namespace libgav1 diff --git a/tests/utils.h b/tests/utils.h new file mode 100644 index 0000000..b3062da --- /dev/null +++ b/tests/utils.h @@ -0,0 +1,138 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_TESTS_UTILS_H_ +#define LIBGAV1_TESTS_UTILS_H_ + +#include <cstddef> +#include <new> +#include <string> + +#include "absl/base/config.h" +#include "absl/time/time.h" +#include "src/gav1/decoder_buffer.h" +#include "src/utils/memory.h" +#include "tests/third_party/libvpx/acm_random.h" + +#ifdef ABSL_HAVE_EXCEPTIONS +#include <exception> +#endif + +namespace libgav1 { +namespace test_utils { + +enum { kAlternateDeterministicSeed = 0x9571 }; +static_assert(kAlternateDeterministicSeed != + libvpx_test::ACMRandom::DeterministicSeed(), + ""); + +// Similar to libgav1::MaxAlignedAllocable, but retains the throwing versions +// of new to support googletest allocations. +struct MaxAlignedAllocable { + // Class-specific allocation functions. 
+ static void* operator new(size_t size) { + void* const p = + libgav1::MaxAlignedAllocable::operator new(size, std::nothrow); +#ifdef ABSL_HAVE_EXCEPTIONS + if (p == nullptr) throw std::bad_alloc(); +#endif + return p; + } + static void* operator new[](size_t size) { + void* const p = + libgav1::MaxAlignedAllocable::operator new[](size, std::nothrow); +#ifdef ABSL_HAVE_EXCEPTIONS + if (p == nullptr) throw std::bad_alloc(); +#endif + return p; + } + + // Class-specific non-throwing allocation functions + static void* operator new(size_t size, const std::nothrow_t& tag) noexcept { + return libgav1::MaxAlignedAllocable::operator new(size, tag); + } + static void* operator new[](size_t size, const std::nothrow_t& tag) noexcept { + return libgav1::MaxAlignedAllocable::operator new[](size, tag); + } + + // Class-specific deallocation functions. + static void operator delete(void* ptr) noexcept { + libgav1::MaxAlignedAllocable::operator delete(ptr); + } + static void operator delete[](void* ptr) noexcept { + libgav1::MaxAlignedAllocable::operator delete[](ptr); + } + + // Only called if new (std::nothrow) is used and the constructor throws an + // exception. + static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept { + libgav1::MaxAlignedAllocable::operator delete(ptr, tag); + } + // Only called if new[] (std::nothrow) is used and the constructor throws an + // exception. + static void operator delete[](void* ptr, const std::nothrow_t& tag) noexcept { + libgav1::MaxAlignedAllocable::operator delete[](ptr, tag); + } +}; + +// Clears dsp table entries for |bitdepth|. This function is not thread safe. +void ResetDspTable(int bitdepth); + +//------------------------------------------------------------------------------ +// Gets human readable hexadecimal encoded MD5 sum from given data, block, or +// frame buffer. 
+ +std::string GetMd5Sum(const void* bytes, size_t size); +template <typename Pixel> +std::string GetMd5Sum(const Pixel* block, int width, int height, int stride); +std::string GetMd5Sum(const DecoderBuffer& buffer); + +//------------------------------------------------------------------------------ +// Compares the md5 digest of |size| bytes of |data| with |expected_digest|. +// Prints a log message with |name|, |function_name|, md5 digest and +// |elapsed_time|. |name| and |function_name| are merely tags used for logging +// and can be any meaningful string depending on the caller's context. + +void CheckMd5Digest(const char name[], const char function_name[], + const char expected_digest[], const void* data, size_t size, + absl::Duration elapsed_time); + +//------------------------------------------------------------------------------ +// Compares the md5 digest of |block| with |expected_digest|. The width, height, +// and stride of |block| are |width|, |height|, and |stride|, respectively. +// Prints a log message with |name|, |function_name|, md5 digest and +// |elapsed_time|. |name| and |function_name| are merely tags used for logging +// and can be any meaningful string depending on the caller's context. + +template <typename Pixel> +void CheckMd5Digest(const char name[], const char function_name[], + const char expected_digest[], const Pixel* block, int width, + int height, int stride, absl::Duration elapsed_time); + +//------------------------------------------------------------------------------ +// Compares |actual_digest| with |expected_digest|. Prints a log message with +// |name|, |function_name|, md5 digest and |elapsed_time|. |name| and +// |function_name| are merely tags used for logging and can be any meaningful +// string depending on the caller's context. 
+ +void CheckMd5Digest(const char name[], const char function_name[], + const char expected_digest[], const char actual_digest[], + absl::Duration elapsed_time); + +} // namespace test_utils +} // namespace libgav1 + +#endif // LIBGAV1_TESTS_UTILS_H_ diff --git a/tests/utils_test.cc b/tests/utils_test.cc new file mode 100644 index 0000000..1d5b598 --- /dev/null +++ b/tests/utils_test.cc @@ -0,0 +1,190 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "tests/utils.h" + +#include <cstddef> +#include <cstdint> +#include <memory> +#include <new> + +#include "absl/base/config.h" +#include "gtest/gtest.h" +#include "src/utils/memory.h" + +#ifdef ABSL_HAVE_EXCEPTIONS +#include <exception> +#endif + +namespace libgav1 { +namespace test_utils { +namespace { + +constexpr size_t kMaxAllocableSize = 0x40000000; + +// Has a trivial default constructor that performs no action. +struct SmallMaxAligned : public MaxAlignedAllocable { + alignas(kMaxAlignment) uint8_t x; +}; + +// Has a nontrivial default constructor that initializes the data member. +struct SmallMaxAlignedNontrivialConstructor : public MaxAlignedAllocable { + alignas(kMaxAlignment) uint8_t x = 0; +}; + +// Has a trivial default constructor that performs no action. +struct HugeMaxAligned : public MaxAlignedAllocable { + alignas(kMaxAlignment) uint8_t x[kMaxAllocableSize + 1]; +}; + +// Has a nontrivial default constructor that initializes the data member. 
+struct HugeMaxAlignedNontrivialConstructor : public MaxAlignedAllocable { + alignas(kMaxAlignment) uint8_t x[kMaxAllocableSize + 1] = {}; +}; + +#ifdef ABSL_HAVE_EXCEPTIONS +struct MaxAlignedThrowingConstructor : public MaxAlignedAllocable { + MaxAlignedThrowingConstructor() { throw std::exception(); } + + uint8_t x; +}; +#endif + +TEST(TestUtilsTest, TestMaxAlignedAllocable) { + { + // MaxAlignedAllocable::operator new (std::nothrow) is called. + std::unique_ptr<SmallMaxAligned> small(new (std::nothrow) SmallMaxAligned); + EXPECT_NE(small, nullptr); + // Note this check doesn't guarantee conformance as a suitably aligned + // address may be returned from any allocator. + EXPECT_EQ(reinterpret_cast<uintptr_t>(small.get()) & (kMaxAlignment - 1), + 0); + // MaxAlignedAllocable::operator delete is called. + } + + { + // MaxAlignedAllocable::operator new is called. + std::unique_ptr<SmallMaxAligned> small(new SmallMaxAligned); + EXPECT_NE(small, nullptr); + // Note this check doesn't guarantee conformance as a suitably aligned + // address may be returned from any allocator. + EXPECT_EQ(reinterpret_cast<uintptr_t>(small.get()) & (kMaxAlignment - 1), + 0); + // MaxAlignedAllocable::operator delete is called. + } + + { + // MaxAlignedAllocable::operator new[] (std::nothrow) is called. + std::unique_ptr<SmallMaxAligned[]> small_array_of_smalls( + new (std::nothrow) SmallMaxAligned[10]); + EXPECT_NE(small_array_of_smalls, nullptr); + EXPECT_EQ(reinterpret_cast<uintptr_t>(small_array_of_smalls.get()) & + (kMaxAlignment - 1), + 0); + // MaxAlignedAllocable::operator delete[] is called. + } + + { + // MaxAlignedAllocable::operator new[] is called. + std::unique_ptr<SmallMaxAligned[]> small_array_of_smalls( + new SmallMaxAligned[10]); + EXPECT_NE(small_array_of_smalls, nullptr); + EXPECT_EQ(reinterpret_cast<uintptr_t>(small_array_of_smalls.get()) & + (kMaxAlignment - 1), + 0); + // MaxAlignedAllocable::operator delete[] is called. 
+ } + + { + // MaxAlignedAllocable::operator new (std::nothrow) is called. + std::unique_ptr<HugeMaxAligned> huge(new (std::nothrow) HugeMaxAligned); + EXPECT_EQ(huge, nullptr); + } + + { + // MaxAlignedAllocable::operator new[] (std::nothrow) is called. + std::unique_ptr<SmallMaxAligned[]> huge_array_of_smalls( + new (std::nothrow) + SmallMaxAligned[kMaxAllocableSize / sizeof(SmallMaxAligned) + 1]); + EXPECT_EQ(huge_array_of_smalls, nullptr); + } + +#ifdef ABSL_HAVE_EXCEPTIONS + try { + // MaxAlignedAllocable::operator new (std::nothrow) is called. + // The constructor throws an exception. + // MaxAlignedAllocable::operator delete (std::nothrow) is called. + auto* always = new (std::nothrow) MaxAlignedThrowingConstructor; + static_cast<void>(always); + } catch (...) { + } + + try { + // MaxAlignedAllocable::operator new is called. + // The constructor throws an exception. + // MaxAlignedAllocable::operator delete is called. + auto* always = new MaxAlignedThrowingConstructor; + static_cast<void>(always); + } catch (...) { + } + + try { + // MaxAlignedAllocable::operator new[] (std::nothrow) is called. + // The constructor throws an exception. + // MaxAlignedAllocable::operator delete[] (std::nothrow) is called. + auto* always = new (std::nothrow) MaxAlignedThrowingConstructor[2]; + static_cast<void>(always); + } catch (...) { + } + + try { + // MaxAlignedAllocable::operator new[] is called. + // The constructor throws an exception. + // MaxAlignedAllocable::operator delete[] is called. + auto* always = new MaxAlignedThrowingConstructor[2]; + static_cast<void>(always); + } catch (...) { + } + + // Note these calls are only safe with exceptions enabled as if the throwing + // operator new returns the object is expected to be valid. In this case an + // attempt to invoke the object's constructor on a nullptr may be made which + // is undefined behavior. + try { + // MaxAlignedAllocable::operator new is called. 
+ std::unique_ptr<HugeMaxAlignedNontrivialConstructor> huge( + new HugeMaxAlignedNontrivialConstructor); + ADD_FAILURE() << "huge allocation should fail."; + } catch (...) { + SUCCEED(); + } + + try { + // MaxAlignedAllocable::operator new[] is called. + std::unique_ptr<SmallMaxAlignedNontrivialConstructor[]> + huge_array_of_smalls( + new SmallMaxAlignedNontrivialConstructor + [kMaxAllocableSize / + sizeof(SmallMaxAlignedNontrivialConstructor) + + 1]); + ADD_FAILURE() << "huge_array_of_smalls allocation should fail."; + } catch (...) { + SUCCEED(); + } +#endif // ABSL_HAVE_EXCEPTIONS +} + +} // namespace +} // namespace test_utils +} // namespace libgav1 |